Message-Id: <20160923112726.5890-2-alexander.shishkin@linux.intel.com>
Date: Fri, 23 Sep 2016 14:27:21 +0300
From: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
vince@...ter.net, eranian@...gle.com,
Arnaldo Carvalho de Melo <acme@...radead.org>,
tglx@...utronix.de, ak@...ux.intel.com,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Subject: [RFC PATCH 1/6] perf: Move mlock accounting to ring buffer allocation

In order to be able to allocate perf ring buffers in a non-mmap path, we
need to make sure that the memory is still accounted to the user and that
they don't exceed their mlock limit.
This patch moves ring buffer memory accounting down into the rb_alloc()
path so that its callers won't have to worry about it. As a side effect,
it also slightly cleans up perf_mmap().

Signed-off-by: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
---
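[Not part of the patch, just an illustration for review: a minimal sketch of
what a non-mmap consumer could look like under the new calling convention.
The function example_setup_buffer() is hypothetical; only the rb_alloc() and
ring_buffer_put() signatures come from this series.]

static int example_setup_buffer(struct perf_event *event, int nr_pages)
{
	struct ring_buffer *rb;

	/*
	 * With a NULL mm, everything is charged against the user's
	 * perf_event_mlock quota; if that would be exceeded, there is
	 * no mm to pin the overflow on and rb_alloc() fails.
	 */
	rb = rb_alloc(NULL, nr_pages, 0, event->cpu, 0);
	if (IS_ERR(rb))
		return PTR_ERR(rb);

	/* ... produce into the buffer ... */

	ring_buffer_put(rb);	/* the last reference undoes the accounting */
	return 0;
}

The point of the move is that such a caller never touches locked_vm or
pinned_vm itself.
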
kernel/events/core.c | 67 +++--------------------
kernel/events/internal.h | 5 +-
kernel/events/ring_buffer.c | 127 ++++++++++++++++++++++++++++++++++++++------
3 files changed, 123 insertions(+), 76 deletions(-)
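
[Also not part of the patch: a condensed restatement of the check that
ring_buffer_account() performs below, for reviewers who want the accounting
rule in one place. The helper name example_may_account() is hypothetical.
sysctl_perf_event_mlock is in KiB, so shifting by (PAGE_SHIFT - 10) converts
it to pages, and the per-user quota scales linearly with online CPUs.]

static int example_may_account(struct user_struct *user, struct mm_struct *mm,
			       unsigned long nr_pages)
{
	unsigned long quota = (sysctl_perf_event_mlock >> (PAGE_SHIFT - 10)) *
			      num_online_cpus();
	unsigned long locked = atomic_long_read(&user->locked_vm) + nr_pages;

	if (locked <= quota)
		return 0;	/* fits entirely in the per-user perf quota */

	if (!mm)
		return -EPERM;	/* nowhere to pin the overflow */

	/* the overflow gets pinned on the mm, subject to RLIMIT_MEMLOCK */
	if (mm->pinned_vm + (locked - quota) >
	    (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT) &&
	    perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK))
		return -EPERM;

	return 0;
}

In other words, pages within the quota are charged to user->locked_vm and
only the overflow is pinned on the mm, which is what perf_mmap() used to
open-code.
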
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c0d263f6b..2e8a0e389b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4933,6 +4933,8 @@ void ring_buffer_put(struct ring_buffer *rb)
if (!atomic_dec_and_test(&rb->refcount))
return;
+ ring_buffer_unaccount(rb, false);
+
WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -4967,9 +4969,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
struct ring_buffer *rb = ring_buffer_get(event);
- struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event);
@@ -4989,11 +4988,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
perf_pmu_output_stop(event);
- /* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
- /* this has to be the last one */
+ /* now it's safe to free the pages; ought to be the last one */
rb_free_aux(rb);
WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
@@ -5054,19 +5049,6 @@ again:
}
rcu_read_unlock();
- /*
- * It could be there's still a few 0-ref events on the list; they'll
- * get cleaned up by free_event() -- they'll also still have their
- * ref on the rb and will free it whenever they are done with it.
- *
- * Aside from that, this buffer is 'fully' detached and unmapped,
- * undo the VM accounting.
- */
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
- free_uid(mmap_user);
-
out_put:
ring_buffer_put(rb); /* could be last */
}
@@ -5081,13 +5063,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
- struct user_struct *user = current_user();
- unsigned long locked, lock_limit;
struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
- long user_extra = 0, extra = 0;
int ret = 0, flags = 0;
/*
@@ -5158,7 +5136,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
}
atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
goto accounting;
}
@@ -5195,49 +5172,24 @@ again:
goto unlock;
}
- user_extra = nr_pages + 1;
-
accounting:
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
- /*
- * Increase the limit linearly with more CPUs:
- */
- user_lock_limit *= num_online_cpus();
-
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
- if (user_locked > user_lock_limit)
- extra = user_locked - user_lock_limit;
-
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
-
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
-
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
if (!rb) {
- rb = rb_alloc(nr_pages,
+ rb = rb_alloc(vma->vm_mm, nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
- if (!rb) {
- ret = -ENOMEM;
+ if (IS_ERR_OR_NULL(rb)) {
+ ret = PTR_ERR(rb);
+ rb = NULL;
goto unlock;
}
atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
@@ -5246,15 +5198,10 @@ accounting:
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
- rb->aux_mmap_locked = extra;
}
unlock:
if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
-
atomic_inc(&event->mmap_count);
} else if (rb) {
atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8..a7ce82b670 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
atomic_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
+ struct mm_struct *mmap_mapping;
/* AUX area */
local_t aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
};
extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
}
extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+ int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b8..484ce09d96 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -496,6 +496,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm; if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+ unsigned long nr_pages, bool aux)
+{
+ unsigned long total, limit, pinned;
+
+ if (!mm)
+ mm = rb->mmap_mapping;
+
+ rb->mmap_user = current_user();
+
+ limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ limit *= num_online_cpus();
+
+ total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+ pinned = 0;
+ if (total > limit) {
+ /*
+ * Everything that's over the sysctl_perf_event_mlock
+ * limit needs to be accounted to the consumer's mm.
+ */
+ if (!mm)
+ return -EPERM;
+
+ pinned = total - limit;
+
+ limit = rlimit(RLIMIT_MEMLOCK);
+ limit >>= PAGE_SHIFT;
+ total = mm->pinned_vm + pinned;
+
+ if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
+ return -EPERM;
+ }
+
+ if (aux)
+ rb->aux_mmap_locked = pinned;
+ else
+ rb->mmap_locked = pinned;
+
+ mm->pinned_vm += pinned;
+ }
+
+ if (!rb->mmap_mapping)
+ rb->mmap_mapping = mm;
+
+ /* account for user page */
+ if (!aux)
+ nr_pages++;
+
+ rb->mmap_user = get_current_user();
+ atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+ return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+ unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+ unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+ atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+ if (rb->mmap_mapping)
+ rb->mmap_mapping->pinned_vm -= pinned;
+
+ free_uid(rb->mmap_user);
+}
+
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
@@ -565,11 +647,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order = 0;
+ int ret, max_order = 0;
if (!has_aux(event))
return -ENOTSUPP;
+ ret = ring_buffer_account(rb, NULL, nr_pages, true);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
/*
* We need to start with the max_order that fits in nr_pages,
@@ -655,8 +742,11 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (atomic_dec_and_test(&rb->aux_refcount)) {
+ ring_buffer_unaccount(rb, true);
+
__rb_free_aux(rb);
+ }
}
#ifndef CONFIG_PERF_USE_VMALLOC
@@ -690,19 +780,22 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+ int cpu, int flags)
{
+ unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
struct ring_buffer *rb;
- unsigned long size;
- int i;
-
- size = sizeof(struct ring_buffer);
- size += nr_pages * sizeof(void *);
+ int i, ret = -ENOMEM;
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
+ ret = ring_buffer_account(rb, mm, nr_pages, false);
+ if (ret)
+ goto fail;
+
+ ret = -ENOMEM;
rb->user_page = perf_mmap_alloc_page(cpu);
if (!rb->user_page)
goto fail_user_page;
@@ -729,7 +822,7 @@ fail_user_page:
kfree(rb);
fail:
- return NULL;
+ return ERR_PTR(ret);
}
static void perf_mmap_free_page(unsigned long addr)
@@ -796,19 +889,23 @@ void rb_free(struct ring_buffer *rb)
schedule_work(&rb->work);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+ int cpu, int flags)
{
+ unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
struct ring_buffer *rb;
- unsigned long size;
void *all_buf;
-
- size = sizeof(struct ring_buffer);
- size += sizeof(void *);
+ int ret = -ENOMEM;
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
+ ret = ring_buffer_account(rb, mm, nr_pages, false);
+ if (ret)
+ goto fail;
+
+ ret = -ENOMEM;
INIT_WORK(&rb->work, rb_free_work);
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -830,7 +927,7 @@ fail_all_buf:
kfree(rb);
fail:
- return NULL;
+ return ERR_PTR(ret);
}
#endif
--
2.9.3