linux-kernel - [RFC] rlimit exceed notification events

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <xunyh9ag254f.fsf@redhat.com>
Date:   Fri, 19 Aug 2016 17:41:20 +0300
From:   Yauheni Kaliuta <yauheni.kaliuta@...hat.com>
To:     linux-kernel@...r.kernel.org
Cc:     Aristeu Rozanski <aris@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Steven Rostedt <rostedt@...dmis.org>
Subject: [RFC] rlimit exceed notification events

Hi!

At the moment there is no clear indication when a process exceeds a resource
limit. In some cases the problematic syscall returns an error; in other cases
the process is simply killed.

I'm trying to implement some sort of monitoring of such events and have a
question: which approach would be acceptable?

1) The straightforward solution would be to instrument every such place with
a printk (something related was implemented, for example, by
d977d56ce5b3e8842236f2f9e7483d4914c9592e).

This raises some concerns about reliability and performance (for example, a
misbehaving application could flood the printk buffer).

2) Using tracepoints. I've used a simple program which calls dup() until it
gets the error 3 times:

$ sudo ./perf record -e rlimit:rlimit_exceeded ./a.out
Couldn't dup file: Too many open files, iteration 1020
Couldn't dup file: Too many open files, iteration 1021
Couldn't dup file: Too many open files, iteration 1022
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.010 MB perf.data (3 samples) ]

$ sudo ./perf report                                  
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 3  of event 'rlimit:rlimit_exceeded'
# Event count (approx.): 3
#
# Overhead  Trace output                                            
# ........  ........................................................
#
   100.00%  RLIMIT NOFILE violation. Current 1024, requested Unknown

The code to demonstrate the idea is below:

diff --git a/fs/file.c b/fs/file.c
index 6b1acdfe59da..a358de041ac4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -947,6 +947,9 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
 		else
 			fput(file);
 	}
+	if (ret == -EMFILE)
+		rlimit_exceeded(RLIMIT_NOFILE,
+				rlimit(RLIMIT_NOFILE), (u64)-1);
 	return ret;
 }
 
diff --git a/include/linux/resource.h b/include/linux/resource.h
index 5bc3116e649c..45a3654991aa 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -9,5 +9,6 @@ struct task_struct;
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
 int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		struct rlimit *new_rlim, struct rlimit *old_rlim);
+void rlimit_exceeded(int rlimit_id, u64 cur, u64 req);
 
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..30999d83a261 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o
+	    async.o range.o smpboot.o rlimit.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
@@ -18,6 +18,8 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
+CFLAGS_rlimit.o := -I$(src)
+
 # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
 # in coverage traces.
 KCOV_INSTRUMENT_softirq.o := n
diff --git a/kernel/rlimit.c b/kernel/rlimit.c
new file mode 100644
index 000000000000..dfb161217a2b
--- /dev/null
+++ b/kernel/rlimit.c
@@ -0,0 +1,11 @@
+
+#include <linux/resource.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace-rlimit.h"
+
+void rlimit_exceeded(int rlimit_id, u64 cur, u64 req)
+{
+	trace_rlimit_exceeded(rlimit_id, cur, req);
+}
+
diff --git a/kernel/trace-rlimit.h b/kernel/trace-rlimit.h
new file mode 100644
index 000000000000..01f725406bf5
--- /dev/null
+++ b/kernel/trace-rlimit.h
@@ -0,0 +1,42 @@
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rlimit
+
+#if !defined(_TRACE_RLIMIT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RLIMIT_H
+#include <linux/tracepoint.h>
+
+TRACE_DEFINE_ENUM(RLIMIT_NOFILE);
+
+TRACE_EVENT(rlimit_exceeded,
+
+	    TP_PROTO(int rlimit_id,
+		     unsigned long long cur,
+		     unsigned long long req),
+
+	    TP_ARGS(rlimit_id, cur, req),
+
+	    TP_STRUCT__entry(
+		    __field(int, rlimit_id)
+		    __field(unsigned long long, cur)
+		    __field(unsigned long long, req)
+		    ),
+	    TP_fast_assign(
+		    __entry->rlimit_id = rlimit_id;
+		    __entry->cur = cur;
+		    __entry->req = req;
+		    ),
+	    TP_printk("RLIMIT %s violation. Current %llu, requested %s",
+		      __print_symbolic(__entry->rlimit_id,
+				       { RLIMIT_NOFILE, "NOFILE" }),
+		      __entry->cur,
+		      __print_symbolic(__entry->req,
+				       {(unsigned long long)-1, "Unknown"}))
+	);
+
+#endif /* _TRACE_RLIMIT_H */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace-rlimit
+#include <trace/define_trace.h>



3) Using perf infrastructure directly
3.1) Since it is a software event, it sounds reasonable to add it as a swevent
counter, which could then be used as:

/*
 * Report an rlimit violation as a PERF_COUNT_SW_RLIMIT software event.
 *
 * The event is raised with a count of 1 against the registers returned by
 * task_pt_regs(current), passing regs->ip as the final argument.
 * rlimit_id, cur and req are accepted but not yet forwarded to perf.
 */
static void perf_event_rlimit(int rlimit_id, u64 cur, u64 req)
{
	struct pt_regs *regs = task_pt_regs(current);

	perf_sw_event(PERF_COUNT_SW_RLIMIT, 1, regs, regs->ip);
}

/*
 * Hook invoked when a resource limit is hit: forwards rlimit_id, cur and
 * req unchanged to perf_event_rlimit().
 */
void rlimit_exceeded(int rlimit_id, u64 cur, u64 req)
{
	perf_event_rlimit(rlimit_id, cur, req);
}


In this form it's not very useful and will require some extension to add
rlimit-related information. It could be some sort of
perf_sw_event_with_context() function at the API level.

3.2) It should be possible to extend the core to generate its own records. The
patch below adds such records to, for example, the CPU_CLOCK software
counter. The same could be used with a dedicated counter. The example:


diff --git a/fs/file.c b/fs/file.c
index 6b1acdfe59da..a358de041ac4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -947,6 +947,9 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
 		else
 			fput(file);
 	}
+	if (ret == -EMFILE)
+		rlimit_exceeded(RLIMIT_NOFILE,
+				rlimit(RLIMIT_NOFILE), (u64)-1);
 	return ret;
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2b6b43cc0dd5..125648038748 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1235,6 +1235,7 @@ extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_task_tick(void);
+extern void perf_event_rlimit(int id, u64 cur, u64 req);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1298,6 +1299,7 @@ static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
 static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
+static inline void perf_event_rlimit(int id, u64 cur, u64 req)		{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/resource.h b/include/linux/resource.h
index 5bc3116e649c..45a3654991aa 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -9,5 +9,6 @@ struct task_struct;
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
 int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		struct rlimit *new_rlim, struct rlimit *old_rlim);
+void rlimit_exceeded(int rlimit_id, u64 cur, u64 req);
 
 #endif
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24ac..468724acea99 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -862,7 +862,14 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_SWITCH_CPU_WIDE		= 15,
 
-	PERF_RECORD_MAX,			/* non-ABI */
+	/*
+	 * Records RLIMITs violations
+	 *
+	 * A structure will be here
+	 */
+	PERF_RECORD_RLIMIT			= 16,
+
+	PERF_RECORD_MAX,	/* non-ABI */
 };
 
 #define PERF_MAX_STACK_DEPTH		127
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..63068a9c017b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o
+	    async.o range.o smpboot.o rlimit.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f3a705..ad1bedb8716d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6277,6 +6277,63 @@ void perf_event_fork(struct task_struct *task)
 }
 
 /*
+ * rlimits violations tracking
+ */
+
+struct perf_rlimit_event {
+	struct perf_event_header        header;
+	u32                             rlimit_id;
+	u64				r_current;
+	u64                             r_requested;
+	u32				reserved;
+} __packed;
+
+static bool perf_event_rlimit_match(struct perf_event *event)
+{
+	return true;
+}
+
+static void perf_event_rlimit_output(struct perf_event *event, void *data)
+{
+	struct perf_rlimit_event *rlimit_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_rlimit_match(event))
+		return;
+
+	perf_event_header__init_id(&rlimit_event->header, &sample, event);
+
+	ret = perf_output_begin(&handle, event, rlimit_event->header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, *rlimit_event);
+	perf_event__output_id_sample(event, &handle, &sample);
+	perf_output_end(&handle);
+}
+
+void perf_event_rlimit(int id, u64 cur, u64 req)
+{
+	struct perf_rlimit_event rlimit_event;
+
+	rlimit_event = (struct perf_rlimit_event) {
+		.header = {
+			.type = PERF_RECORD_RLIMIT,
+			.size = sizeof(rlimit_event),
+		},
+		.rlimit_id = id,
+		.r_current = cur,
+		.r_requested = req,
+	};
+
+	perf_iterate_sb(perf_event_rlimit_output,
+			&rlimit_event,
+			NULL);
+}
+
+/*
  * comm tracking
  */
 
diff --git a/kernel/rlimit.c b/kernel/rlimit.c
new file mode 100644
index 000000000000..3ac043ea5383
--- /dev/null
+++ b/kernel/rlimit.c
@@ -0,0 +1,8 @@
+
+#include <linux/perf_event.h>
+#include <linux/resource.h>
+
+void rlimit_exceeded(int rlimit_id, u64 cur, u64 req)
+{
+	perf_event_rlimit(rlimit_id, cur, req);
+}


3.3) Create a new pmu for type PERF_TYPE_RLIMIT with its own events and its own record.

All perf changes will require some special userspace and/or perf utility
changes.

4) Something else.


So, any input about an upstreamable way to implement the feature would be appreciated.


-- 
WBR,
Yauheni Kaliuta