Date:	Tue, 17 Jan 2012 20:51:13 +0200 (EET)
From:	Pekka Enberg <penberg@...nel.org>
To:	Rik van Riel <riel@...hat.com>
cc:	Minchan Kim <minchan@...nel.org>, linux-mm <linux-mm@...ck.org>,
	LKML <linux-kernel@...r.kernel.org>, leonid.moiseichuk@...ia.com,
	kamezawa.hiroyu@...fujitsu.com, mel@....ul.ie, rientjes@...gle.com,
	KOSAKI Motohiro <kosaki.motohiro@...il.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Marcelo Tosatti <mtosatti@...hat.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Ronen Hod <rhod@...hat.com>,
	KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>
Subject: Re: [RFC 1/3] /dev/low_mem_notify

Hello,

OK, so here's a proof-of-concept patch that implements sample-based
per-process free-threshold VM event watching using a perf-like syscall
ABI. I'd really like to see something like this that's much more
extensible and cleaner than the /dev-based ABIs that people have
proposed so far.
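
To make the threshold semantics concrete: the sampled free page count is
compared against the configured percentage of available pages, and an
event is reported once free pages fall to or below that percentage. Here
is a minimal userspace-side sketch of the same check that vmnotify_match()
does in the patch below (the helper name and standalone form are just
illustration, not part of the patch):

     #include <stdbool.h>

     /* Mirrors the in-kernel vmnotify_match() free-threshold check. */
     static bool crossed_free_threshold(unsigned long long nr_free_pages,
                                        unsigned long long nr_avail_pages,
                                        unsigned int free_threshold_pct)
     {
             /* No known available pages: never report an event. */
             if (!nr_avail_pages)
                     return false;

             /* Report once free memory is at or below the threshold. */
             return nr_free_pages * 100 / nr_avail_pages <= free_threshold_pct;
     }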

 			Pekka

------------------->

From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@...nel.org>
Date: Tue, 17 Jan 2012 17:51:48 +0200
Subject: [PATCH] vmnotify: VM event notification system

This patch implements a new sys_vmnotify_fd() system call that returns a
pollable file descriptor that can be used to watch VM events.

For example, to watch for a VM event when free memory drops below 99% of
available memory, using a one-second sample period, you'd do something
like this:

     struct vmnotify_config config;
     struct vmnotify_event event;
     struct pollfd pollfd;
     int fd;

     config = (struct vmnotify_config) {
             .type                   = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
             .sample_period_ns       = 1000000000L,
             .free_threshold         = 99,
     };

     fd = sys_vmnotify_fd(&config);

     pollfd.fd               = fd;
     pollfd.events           = POLLIN;

     if (poll(&pollfd, 1, -1) < 0) {
             perror("poll failed");
             exit(1);
     }

     memset(&event, 0, sizeof(event));

     if (read(fd, &event, sizeof(event)) < 0) {
             perror("read failed");
             exit(1);
     }

Signed-off-by: Pekka Enberg <penberg@...nel.org>
---
  arch/x86/include/asm/unistd_64.h       |    2 +
  include/linux/vmnotify.h               |   44 ++++++
  mm/Kconfig                             |    6 +
  mm/Makefile                            |    1 +
  mm/vmnotify.c                          |  235 ++++++++++++++++++++++++++++++++
  tools/testing/vmnotify/vmnotify-test.c |   68 +++++++++
  6 files changed, 356 insertions(+), 0 deletions(-)
  create mode 100644 include/linux/vmnotify.h
  create mode 100644 mm/vmnotify.c
  create mode 100644 tools/testing/vmnotify/vmnotify-test.c

diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 0431f19..b0928cd 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
  __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
  #define __NR_process_vm_writev			311
  __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_vmnotify_fd			312
+__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)

  #ifndef __NO_STUBS
  #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
new file mode 100644
index 0000000..8f8642b
--- /dev/null
+++ b/include/linux/vmnotify.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_VMNOTIFY_H
+#define _LINUX_VMNOTIFY_H
+
+#include <linux/types.h>
+
+enum {
+	VMNOTIFY_TYPE_FREE_THRESHOLD	= 1ULL << 0,
+	VMNOTIFY_TYPE_SAMPLE		= 1ULL << 1,
+};
+
+struct vmnotify_config {
+	/*
+	 * Size of the struct for ABI extensibility.
+	 */
+	__u32			size;
+
+	/*
+	 * Notification type bitmask
+	 */
+	__u64			type;
+
+	/*
+	 * Free memory threshold in percent [1..99]
+	 */
+	__u32			free_threshold;
+
+	/*
+	 * Sample period in nanoseconds
+	 */
+	__u64			sample_period_ns;
+};
+
+struct vmnotify_event {
+	/* Size of the struct for ABI extensibility. */
+	__u32			size;
+
+	__u64			nr_avail_pages;
+
+	__u64			nr_swap_pages;
+
+	__u64			nr_free_pages;
+};
+
+#endif /* _LINUX_VMNOTIFY_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 011b110..6631167 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -373,3 +373,9 @@ config CLEANCACHE
  	  in a negligible performance hit.

  	  If unsure, say Y to enable cleancache
+
+config VMNOTIFY
+	bool "Enable VM event notification system"
+	default n
+	help
+	  If unsure, say N to disable vmnotify
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..e1b5db3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
  obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
  obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_VMNOTIFY) += vmnotify.o
diff --git a/mm/vmnotify.c b/mm/vmnotify.c
new file mode 100644
index 0000000..6800450
--- /dev/null
+++ b/mm/vmnotify.c
@@ -0,0 +1,235 @@
+#include <linux/anon_inodes.h>
+#include <linux/vmnotify.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+
+#define VMNOTIFY_MAX_FREE_THRESHOLD	100
+
+struct vmnotify_watch {
+	struct vmnotify_config		config;
+
+	struct mutex			mutex;
+	bool				pending;
+	struct vmnotify_event		event;
+
+	/* sampling */
+	struct hrtimer			timer;
+
+	/* poll */
+	wait_queue_head_t		waitq;
+};
+
+static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
+{
+	if (watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
+		u64 threshold;
+
+		if (!event->nr_avail_pages)
+			return false;
+
+		threshold = event->nr_free_pages * 100 / event->nr_avail_pages;
+		if (threshold > watch->config.free_threshold)
+			return false;
+	}
+
+	return true;
+}
+
+static void vmnotify_sample(struct vmnotify_watch *watch)
+{
+	struct vmnotify_event event;
+	struct sysinfo si;
+
+	memset(&event, 0, sizeof(event));
+
+	event.size		= sizeof(event);
+	event.nr_free_pages	= global_page_state(NR_FREE_PAGES);
+
+	si_meminfo(&si);
+	event.nr_avail_pages	= si.totalram;
+
+#ifdef CONFIG_SWAP
+	si_swapinfo(&si);
+	event.nr_swap_pages	= si.totalswap;
+#endif
+
+	if (!vmnotify_match(watch, &event))
+		return;
+
+	mutex_lock(&watch->mutex);
+
+	watch->pending = true;
+
+	memcpy(&watch->event, &event, sizeof(event));
+
+	mutex_unlock(&watch->mutex);
+}
+
+static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)
+{
+	struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
+	u64 sample_period = watch->config.sample_period_ns;
+
+	vmnotify_sample(watch);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+	wake_up(&watch->waitq);
+
+	return HRTIMER_RESTART;
+}
+
+static void vmnotify_start_timer(struct vmnotify_watch *watch)
+{
+	u64 sample_period = watch->config.sample_period_ns;
+
+	hrtimer_init(&watch->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	watch->timer.function = vmnotify_timer_fn;
+
+	hrtimer_start(&watch->timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);
+}
+
+static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
+{
+	struct vmnotify_watch *watch = file->private_data;
+	unsigned int events = 0;
+
+	poll_wait(file, &watch->waitq, wait);
+
+	mutex_lock(&watch->mutex);
+
+	if (watch->pending)
+		events |= POLLIN;
+
+	mutex_unlock(&watch->mutex);
+
+	return events;
+}
+
+static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct vmnotify_watch *watch = file->private_data;
+	int ret = 0;
+
+	mutex_lock(&watch->mutex);
+
+	if (!watch->pending)
+		goto out_unlock;
+
+	if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	ret = watch->event.size;
+
+	watch->pending = false;
+
+out_unlock:
+	mutex_unlock(&watch->mutex);
+
+	return ret;
+}
+
+static int vmnotify_release(struct inode *inode, struct file *file)
+{
+	struct vmnotify_watch *watch = file->private_data;
+
+	hrtimer_cancel(&watch->timer);
+
+	kfree(watch);
+
+	return 0;
+}
+
+static const struct file_operations vmnotify_fops = {
+	.poll		= vmnotify_poll,
+	.read		= vmnotify_read,
+	.release	= vmnotify_release,
+};
+
+static struct vmnotify_watch *vmnotify_watch_alloc(void)
+{
+	struct vmnotify_watch *watch;
+
+	watch = kzalloc(sizeof *watch, GFP_KERNEL);
+	if (!watch)
+		return NULL;
+
+	mutex_init(&watch->mutex);
+
+	init_waitqueue_head(&watch->waitq);
+
+	return watch;
+}
+
+static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
+				struct vmnotify_config *config)
+{
+	int ret;
+
+	ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
+	if (ret)
+		return -EFAULT;
+
+	if (!config->type)
+		return -EINVAL;
+
+	if (config->type & VMNOTIFY_TYPE_SAMPLE) {
+		if (config->sample_period_ns < NSEC_PER_MSEC)
+			return -EINVAL;
+	}
+
+	if (config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
+		if (config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOLD)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+SYSCALL_DEFINE1(vmnotify_fd,
+		struct vmnotify_config __user *, uconfig)
+{
+	struct vmnotify_watch *watch;
+	struct file *file;
+	int err;
+	int fd;
+
+	watch = vmnotify_watch_alloc();
+	if (!watch)
+		return -ENOMEM;
+
+	err = vmnotify_copy_config(uconfig, &watch->config);
+	if (err)
+		goto err_free;
+
+	fd = get_unused_fd_flags(O_RDONLY);
+	if (fd < 0) {
+		err = fd;
+		goto err_free;
+	}
+
+	file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_fd;
+	}
+
+	fd_install(fd, file);
+
+	if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
+		vmnotify_start_timer(watch);
+
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_free:
+	kfree(watch);
+	return err;
+}
diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
new file mode 100644
index 0000000..3c6b26d
--- /dev/null
+++ b/tools/testing/vmnotify/vmnotify-test.c
@@ -0,0 +1,68 @@
+#include "../../../include/linux/vmnotify.h"
+
+#if defined(__x86_64__)
+#include "../../../arch/x86/include/asm/unistd.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <poll.h>
+
+static int sys_vmnotify_fd(struct vmnotify_config *config)
+{
+	config->size = sizeof(*config);
+
+	return syscall(__NR_vmnotify_fd, config);
+}
+
+int main(int argc, char *argv[])
+{
+	struct vmnotify_config config;
+	struct vmnotify_event event;
+	struct pollfd pollfd;
+	int i;
+	int fd;
+
+	config = (struct vmnotify_config) {
+		.type			= VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
+		.sample_period_ns	= 1000000000L,
+		.free_threshold		= 99,
+	};
+
+	fd = sys_vmnotify_fd(&config);
+	if (fd < 0) {
+		perror("vmnotify_fd failed");
+		exit(1);
+	}
+
+	for (i = 0; i < 10; i++) {
+		pollfd.fd		= fd;
+		pollfd.events		= POLLIN;
+
+		if (poll(&pollfd, 1, -1) < 0) {
+			perror("poll failed");
+			exit(1);
+		}
+
+		memset(&event, 0, sizeof(event));
+
+		if (read(fd, &event, sizeof(event)) < 0) {
+			perror("read failed");
+			exit(1);
+		}
+
+		printf("VM event:\n");
+		printf("\tsize=%u\n", event.size);
+		printf("\tnr_avail_pages=%llu\n", event.nr_avail_pages);
+		printf("\tnr_swap_pages=%llu\n", event.nr_swap_pages);
+		printf("\tnr_free_pages=%llu\n", event.nr_free_pages);
+	}
+	if (close(fd) < 0) {
+		perror("close failed");
+		exit(1);
+	}
+
+	return 0;
+}
-- 
1.7.6.4

