[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20170228181343.16588-2-willy@infradead.org>
Date: Tue, 28 Feb 2017 10:13:42 -0800
From: Matthew Wilcox <willy@...radead.org>
To: linux-kernel@...r.kernel.org
Cc: Matthew Wilcox <mawilcox@...rosoft.com>, linux-mm@...ck.org,
linux-fsdevel@...r.kernel.org
Subject: [PATCH 1/2] Add XArray
From: Matthew Wilcox <mawilcox@...rosoft.com>
The xarray is an array of 2^BITS_PER_LONG pointers which handles its own
locking and memory allocation. It is intended to replace the radix tree.
Signed-off-by: Matthew Wilcox <mawilcox@...rosoft.com>
---
include/linux/xarray.h | 764 ++++++++++++++++++++++++++++
lib/Makefile | 3 +-
lib/xarray.c | 1305 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 2071 insertions(+), 1 deletion(-)
create mode 100644 include/linux/xarray.h
create mode 100644 lib/xarray.c
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
new file mode 100644
index 000000000000..646ff84b4444
--- /dev/null
+++ b/include/linux/xarray.h
@@ -0,0 +1,764 @@
+#ifndef _LINUX_XARRAY_H
+#define _LINUX_XARRAY_H
+/*
+ * eXtensible Arrays
+ * Copyright (c) 2017 Microsoft Corporation
+ * Author: Matthew Wilcox <mawilcox@...rosoft.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/**
+ * An eXtensible Array is an array of pointers indexed by an unsigned
+ * long. The xarray takes care of its own locking, using an irqsafe
+ * spinlock for operations that modify the array and RCU for operations
+ * which only read from the array.
+ *
+ * The xarray can store exceptional entries as well as pointers.
+ * Exceptional entries have a value between 0 and 2^(BITS_PER_LONG - 1).
+ * You cannot store arbitrary unsigned longs in the xarray as some
+ * values are reserved for internal use. It is better to avoid storing
+ * IS_ERR() pointers in the array as it is hard to distinguish between
+ * xa_store() having failed to allocate memory, and a previously successful
+ * store of the entry ERR_PTR(-ENOMEM) in the array.
+ *
+ * A freshly initialised xarray is full of NULL pointers. You can set
+ * the entry at any index by calling xa_store(), and get the value by
+ * calling xa_load(). There is no difference between an entry which has
+ * never been stored to and an entry which has most recently had NULL stored
+ * to it. You can conditionally update the value of an entry by calling
+ * xa_replace(). Each entry which isn't NULL can be tagged with up to three
+ * bits of extra information, accessed through xa_get_tag(), xa_set_tag() and
+ * xa_clear_tag(). You can copy batches of entries out of the array by
+ * calling xa_get_entries() or xa_get_tagged(). You can iterate over present
+ * entries in the array by calling xa_find(), xa_next() or xa_for_each().
+ *
+ * There are two levels of API provided. Normal and Advanced. Define
+ * XA_ADVANCED before including this file to get access to the extra API.
+ * The advanced API has sharp edges and you can easily corrupt an xarray.
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+/**
+ * struct xarray - The anchor of the XArray
+ * @xa_lock: Lock protecting writes to the array.
+ * @xa_flags: Internal XArray flags.
+ * @xa_head: The first pointer in the array.
+ *
+ * If all of the pointers in the array are NULL, @xa_head is a NULL pointer.
+ * If the only non-NULL pointer in the array is at index 0, @xa_head is that
+ * pointer. If any other pointer in the array is non-NULL, @xa_head points
+ * to an @xa_node.
+ */
+struct xarray {
+ spinlock_t xa_lock;
+ unsigned int xa_flags;
+ void __rcu *xa_head;
+};
+
+#define __XARRAY_INIT(name, flags) { \
+ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
+ .xa_flags = flags, \
+ .xa_head = NULL, \
+}
+
+#define XARRAY_INIT(name) __XARRAY_INIT(name, 0)
+#define XARRAY_FREE_INIT(name) __XARRAY_INIT(name, XA_FLAG_TRACK_FREE | 1)
+
+#define DEFINE_XARRAY(name) struct xarray name = XARRAY_INIT(name)
+
+/*
+ * The low three bits of the flag field are used for storing the tags.
+ *
+ * The TRACK_FREE flag indicates that the XA_FREE_TAG tag is used to track
+ * free entries in the xarray. If you set this flag, only tags 1 & 2
+ * are available for you to use.
+ */
+#define XA_FLAG_TRACK_FREE (1 << 3)
+
+void *xa_load(const struct xarray *, unsigned long index);
+void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
+void *xa_replace(struct xarray *, unsigned long index,
+ void *entry, void *old, gfp_t);
+
+typedef unsigned __bitwise xa_tag_t;
+#define XA_TAG_0 ((__force xa_tag_t)0U)
+#define XA_TAG_1 ((__force xa_tag_t)1U)
+#define XA_TAG_2 ((__force xa_tag_t)2U)
+
+#define XA_TAG_MAX XA_TAG_2
+#define XA_FREE_TAG XA_TAG_0
+
+/*
+ * The vast majority of users have a constant tag, so the compiler can
+ * optimise this away.
+ */
+static inline bool xa_bad_tag(xa_tag_t tag)
+{
+ return (WARN_ON_ONCE((__force unsigned)tag >
+ (__force unsigned)XA_TAG_MAX));
+}
+
+/**
+ * xa_tagged() - Inquire whether any entry in this array has a tag set
+ * @xa: Array
+ * @tag: Tag value
+ *
+ * Return: True if any entry has this tag set, false if no entry does.
+ */
+static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
+{
+ if (xa_bad_tag(tag))
+ return false;
+ return xa->xa_flags & (1 << (__force unsigned)tag);
+}
+
+bool __xa_get_tag(const struct xarray *, unsigned long index, xa_tag_t);
+void *__xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
+void *__xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
+
+#define xa_get_tag(xa, index, tag) \
+ (!xa_bad_tag(tag) && __xa_get_tag(xa, index, tag))
+#define xa_set_tag(xa, index, tag) \
+ (xa_bad_tag(tag) ? ERR_PTR(-EINVAL) : __xa_set_tag(xa, index, tag))
+#define xa_clear_tag(xa, index, tag) \
+ (xa_bad_tag(tag) ? ERR_PTR(-EINVAL) : __xa_clear_tag(xa, index, tag))
+
+int xa_get_entries(const struct xarray *, unsigned long start, void **dst,
+ unsigned int n);
+int xa_get_tagged(const struct xarray *, unsigned long start, void **dst,
+ unsigned int n, xa_tag_t);
+
+/**
+ * xa_empty() - Determine if an array has any present entries
+ * @xa: Array
+ *
+ * Return: True if the array has no entries in it.
+ */
+static inline bool xa_empty(const struct xarray *xa)
+{
+ return xa->xa_head == NULL;
+}
+
+void *xa_find(const struct xarray *xa, unsigned long *index, unsigned long max);
+void *xa_next(const struct xarray *xa, unsigned long *index, unsigned long max);
+
+/**
+ * xa_for_each() - Iterate over a portion of an array
+ * @xa: Array
+ * @entry: Pointer retrieved from array
+ * @index: Index of pointer retrieved from the array
+ * @max: Maximum index to retrieve from array
+ *
+ * Initialise @index to the minimum index you want to retrieve from
+ * the array. During the iteration, @entry will have the value of the
+ * pointer stored in @xa at @index. The iteration will skip all NULL
+ * pointers in the array. You may modify @index during the
+ * iteration if you want to skip indices. It is safe to modify the
+ * array during the iteration. At the end of the iteration, @entry will
+ * be set to NULL and @index will have a value less than or equal to max.
+ *
+ * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have
+ * to handle your own locking with xas_for_each(), and if you have to unlock
+ * after each iteration, it will also end up being O(n.log(n)).
+ */
+#define xa_for_each(xa, entry, index, max) \
+ for (entry = xa_find(xa, &index, max); entry; \
+ entry = xa_next(xa, &index, max))
+
+/**
+ * xa_mk_exceptional() - Create an exceptional entry
+ * @v: value to store in exceptional entry
+ */
+static inline void *xa_mk_exceptional(unsigned long v)
+{
+ WARN_ON(v > LONG_MAX);
+ return (void *)((v << 1) | 1);
+}
+
+/**
+ * xa_exceptional_value() - Get value stored in an exceptional entry
+ * @entry: Value stored in xarray
+ */
+static inline unsigned long xa_exceptional_value(void *entry)
+{
+ return (unsigned long)entry >> 1;
+}
+
+/**
+ * xa_is_exceptional() - Determine if an entry is exceptional
+ * @entry: Value stored in xarray
+ *
+ * Return: True if the entry is exceptional
+ */
+static inline bool xa_is_exceptional(void *entry)
+{
+ return (unsigned long)entry & 1;
+}
+
+#ifdef XA_ADVANCED
+
+#ifdef XA_DEBUG
+#define XA_BUG_ON(x) BUG_ON(x)
+#else
+#define XA_BUG_ON(x)
+#endif
+
+/* XXX: unused */
+typedef unsigned __bitwise xa_tags_t;
+
+#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock);
+#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
+#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock)
+#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock)
+#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock)
+#define xa_lock_irqsave(xa, flags) \
+ spin_lock_irqsave(&(xa)->xa_lock, flags)
+#define xa_unlock_irqrestore(xa, flags) \
+ spin_unlock_irqrestore(&(xa)->xa_lock, flags)
+
+/*
+ * The xarray is constructed out of a set of 'chunks' of pointers. Choosing
+ * the best chunk size requires some tradeoffs. A power of two recommends
+ * itself so that we can arrange the chunks into a tree and navigate based
+ * purely on shifts and masks. Generally, the larger the better; as the
+ * number of slots per level of the tree increases, the less tall the tree
+ * needs to be. But that needs to be balanced against the memory consumption
+ * of each node. On a 64-bit system, xa_node is currently 576 bytes, and we
+ * get 7 of them per 4kB page. If we doubled the number of slots per node,
+ * we'd get only 3 nodes per 4kB page.
+ */
+#define XA_CHUNK_SHIFT 6
+#define XA_CHUNK_SIZE (1 << XA_CHUNK_SHIFT)
+#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)
+#define XA_MAX_TAGS 3
+#define XA_TAG_LONGS (XA_CHUNK_SIZE / BITS_PER_LONG)
+#define XA_PREALLOC_COUNT ((BITS_PER_LONG / XA_CHUNK_SHIFT) * 2 + 1)
+
+struct xa_node {
+ union {
+ unsigned char offset;
+ unsigned char max;
+ };
+ unsigned char shift;
+ unsigned char count;
+ unsigned char exceptional;
+ struct xa_node __rcu *parent;
+ struct xarray *array;
+ union {
+ struct list_head private_list;
+ struct rcu_head rcu_head;
+ };
+ unsigned long tags[XA_MAX_TAGS][XA_TAG_LONGS];
+ void __rcu *slots[XA_CHUNK_SIZE];
+};
+
+/*
+ * Entries in the xarray have three possible types:
+ * 1. Kernel pointers. These have the bottom two bits clear.
+ * 2. Exceptional entries. These have the bottom bit set.
+ * 3. Internal entries. These have the bottom two bits equal to 10b.
+ *
+ * Internal entries can only be observed if you're using the advanced
+ * API; the normal API will not expose them to the user.
+ *
+ * There are six subtypes of internal entries:
+ * 3a. Node entry. This entry points to a node closer to the edge.
+ * 3b. Retry entry. This indicates that the entry you're looking for is
+ * in the array, but it's been moved to a different node. Retry your
+ * lookup from the head of the array.
+ * 3c. Sibling entry. This indicates that the entry you're looking for
+ * is stored in a different slot in the same node.
+ * 3d. Cousin entry. This indicates that the entry you're looking for
+ * is stored in a slot in a different node.
+ * 3e. IDR NULL entry. The IDR distinguishes between allocated NULL pointers
+ * and free entries. The easiest way to support this in the xarray is to
+ * substitute an internal entry for the NULL pointer.
+ * 3f. Walk End entry. This entry is never found in the array. It is
+ * returned by iteration functions to signal that the iteration has
+ * finished.
+ *
+ * The head of the array never contains a retry, sibling or cousin entry;
+ * these entries can only be found in array nodes.
+ */
+
+static inline void *xa_mk_internal(unsigned long v)
+{
+ return (void *)((v << 2) | 2);
+}
+
+static inline unsigned long xa_internal_value(void *entry)
+{
+ return (unsigned long)entry >> 2;
+}
+
+static inline bool xa_is_internal(void *entry)
+{
+ return ((unsigned long)entry & 3) == 2;
+}
+
+static inline void *xa_mk_node(struct xa_node *node)
+{
+ return (void *)((unsigned long)node | 2);
+}
+
+static inline struct xa_node *xa_node(void *entry)
+{
+ return (struct xa_node *)((unsigned long)entry & ~3UL);
+}
+
+static inline bool xa_is_node(void *entry)
+{
+ return xa_is_internal(entry) && (unsigned long)entry > 4096;
+}
+
+static inline void *xa_mk_sibling(unsigned int offset)
+{
+ return xa_mk_internal(offset);
+}
+
+static inline unsigned long xa_sibling_offset(void *entry)
+{
+ return xa_internal_value(entry);
+}
+
+static inline bool xa_is_sibling(void *entry)
+{
+ return xa_is_internal(entry) && xa_sibling_offset(entry) < 256;
+}
+
+static inline void *xa_mk_cousin(unsigned long offset)
+{
+ return xa_mk_internal(offset + 256);
+}
+
+static inline unsigned long xa_cousin_offset(void *entry)
+{
+ return xa_internal_value(entry) - 256;
+}
+
+static inline bool xa_is_cousin(void *entry)
+{
+ return xa_is_internal(entry) && xa_cousin_offset(entry) < 256;
+}
+
+static inline bool xa_is_relative(void *entry)
+{
+ return xa_is_internal(entry) && xa_internal_value(entry) < 512;
+}
+
+/*
+ * Values 0-255 are reserved for sibling entries (0-62 actually used)
+ * Values 256-511 are reserved for cousin entries (0-62, 64 actually used)
+ * 515-1023 are available for use before we start packing siblings & cousins
+ * closer together.
+ */
+#define XA_IDR_NULL xa_mk_internal(512)
+#define XA_RETRY_ENTRY xa_mk_internal(513)
+#define XA_WALK_END xa_mk_internal(514)
+
+/*
+ * These are not array entries. They are found only in xas->xa_node and
+ * mean that the lookup should be restarted from the top. They must be
+ * distinct from NULL, any valid node pointer and the IS_ERR() range.
+ * Making them small means we can test for them cheaply.
+ */
+#define XA_WALK_RESTART ((struct xa_node *)1)
+#define XA_RESTART_FIND ((struct xa_node *)1)
+#define XA_RESTART_NEXT ((struct xa_node *)2)
+
+static inline bool xa_is_retry(void *entry)
+{
+ return unlikely(entry == XA_RETRY_ENTRY);
+}
+
+static inline bool xa_is_idr_null(void *entry)
+{
+ return entry == XA_IDR_NULL;
+}
+
+static inline bool xa_track_free(struct xarray *xa)
+{
+ return xa->xa_flags & XA_FLAG_TRACK_FREE;
+}
+
+static inline void *xa_head(const struct xarray *xa)
+{
+ return rcu_dereference_check(xa->xa_head,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_head_locked(const struct xarray *xa)
+{
+ return rcu_dereference_protected(xa->xa_head,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_entry(const struct xarray *xa,
+ const struct xa_node *node, unsigned int offset)
+{
+ XA_BUG_ON(offset >= XA_CHUNK_SIZE);
+ return rcu_dereference_check(node->slots[offset],
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_entry_locked(const struct xarray *xa,
+ const struct xa_node *node, unsigned int offset)
+{
+ XA_BUG_ON(offset >= XA_CHUNK_SIZE);
+ return rcu_dereference_protected(node->slots[offset],
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_parent(const struct xarray *xa,
+ const struct xa_node *node)
+{
+ return rcu_dereference_check(node->parent,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_parent_locked(const struct xarray *xa,
+ const struct xa_node *node)
+{
+ return rcu_dereference_protected(node->parent,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void *xa_deref_locked(struct xarray *xa, void __rcu **slot)
+{
+ return rcu_dereference_protected(*slot, lockdep_is_held(&xa->xa_lock));
+}
+
+typedef void (*xa_update_node_t)(struct xa_node *);
+
+/**
+ * xa_state - XArray operation state
+ * @xa_index: The index which this operation is currently about.
+ * @xa_shift: The shift of the node containing the entry we're interested in.
+ * @xa_slots: The number of slots occupied by that entry in that node.
+ * @xa_flags: Flags, see below
+ * @xa_offset: This entry's offset within the chunk of slots.
+ * @xa_node: The node containing this entry, or NULL if the entry is at
+ * xa_head, or XA_WALK_RESTART to start walking through the array
+ * from the head, or an IS_ERR pointer if an error occurred.
+ * @xa_alloc: One preallocated node.
+ * @xa_count: Number of entries added/deleted so far during this operation.
+ * @xa_exceptional: Number of exceptional entries added/deleted.
+ * @xa_update: Callback when updating a node.
+ *
+ * Some of this state may seem redundant, but some of it is input state and
+ * some is output state. For example, xa_shift is not equal to xa_node->shift
+ * until we have walked through the array to the correct xa_node.
+ */
+struct xa_state {
+ unsigned long xa_index;
+ unsigned char xa_shift;
+ unsigned char xa_slots;
+ unsigned char xa_offset;
+ unsigned char xa_flags;
+ struct xa_node *xa_node;
+ struct xa_node *xa_alloc;
+ long xa_count;
+ long xa_exceptional;
+ xa_update_node_t xa_update;
+};
+
+/*
+ * XAS_FLAG_LOOKUP - Find this index. If clear, it means we're searching for
+ * the next index. This only makes a difference if we see a multislot entry;
+ * if set, we move backwards to return the entry. If clear, we move forwards
+ * and find the next entry.
+ */
+#define XAS_FLAG_LOOKUP 1
+
+static inline int xas_error(const struct xa_state *xas)
+{
+ return IS_ERR(xas->xa_node) ? PTR_ERR(xas->xa_node) : 0;
+}
+
+static inline void xas_set_err(struct xa_state *xas, int err)
+{
+ XA_BUG_ON(err > 0);
+ xas->xa_node = ERR_PTR(err);
+}
+
+static inline bool xas_restart(const struct xa_state *xas)
+{
+ return unlikely(xas->xa_node == XA_WALK_RESTART);
+}
+
+static inline void xas_retry(struct xa_state *xas)
+{
+ xas->xa_flags |= XAS_FLAG_LOOKUP;
+ xas->xa_node = XA_WALK_RESTART;
+}
+
+static inline void xas_pause(struct xa_state *xas)
+{
+ xas->xa_node = XA_WALK_RESTART;
+}
+
+static inline void xas_jump(struct xa_state *xas, unsigned long index)
+{
+ xas->xa_index = index;
+ xas->xa_node = XA_WALK_RESTART;
+}
+
+void xas_init_range(struct xa_state *, unsigned long index,
+ unsigned char shift, unsigned char slots);
+void xas_destroy(struct xa_state *);
+bool xas_nomem(struct xa_state *, gfp_t);
+
+void *xas_load(const struct xarray *, struct xa_state *);
+void *xas_store(struct xarray *, struct xa_state *, void *entry);
+void *xas_create(struct xarray *, struct xa_state *);
+int xas_split(struct xarray *, struct xa_state *, unsigned int order);
+
+bool xas_get_tag(const struct xarray *, const struct xa_state *, xa_tag_t);
+void xas_set_tag(struct xarray *, const struct xa_state *, xa_tag_t);
+void xas_clear_tag(struct xarray *, const struct xa_state *, xa_tag_t);
+void xas_init_tags(struct xarray *, const struct xa_state *);
+void *xas_find_tag(const struct xarray *, struct xa_state *, unsigned long max,
+ xa_tag_t);
+
+void *xas_next_entry(const struct xarray *, struct xa_state *,
+ unsigned long max);
+void *__xas_prev_slot(const struct xarray *, struct xa_state *,
+ unsigned long min);
+
+/*
+ * xas_init() - Set up xarray operation state
+ * @xas: Array operation state.
+ * @index: Target of the operation.
+ */
+static inline void xas_init(struct xa_state *xas, unsigned long index)
+{
+ xas_init_range(xas, index, 0, 1);
+}
+
+/**
+ * xas_init_order() - Set up xarray operation state for a multislot entry
+ * @xas: Array operation state.
+ * @index: Target of the operation.
+ * @order: Entry occupies 2^@...er indices.
+ */
+static inline void xas_init_order(struct xa_state *xas, unsigned long index,
+ unsigned int order)
+{
+ unsigned char shift = order - (order % XA_CHUNK_SHIFT);
+ unsigned char slots = 1 << (order % XA_CHUNK_SHIFT);
+
+ index = ((index >> shift) & ~(slots - 1UL)) << shift;
+ xas_init_range(xas, index, shift, slots);
+}
+
+static inline void *xas_next(const struct xarray *xa, struct xa_state *xas,
+ unsigned long max)
+{
+ struct xa_node *node = xas->xa_node;
+ void *entry;
+
+ if (unlikely(!node) || IS_ERR(node))
+ return XA_WALK_END;
+ if (unlikely(node == XA_WALK_RESTART))
+ return xas_next_entry(xa, xas, max);
+
+ do {
+ xas->xa_index = (xas->xa_index |
+ ((1UL << node->shift) - 1)) + 1;
+ if (unlikely(xas->xa_index > max))
+ return XA_WALK_END;
+
+ if (unlikely(++xas->xa_offset == XA_CHUNK_SIZE))
+ return xas_next_entry(xa, xas, max);
+ entry = xa_entry(xa, node, xas->xa_offset);
+ if (unlikely(xa_is_internal(entry)))
+ return xas_next_entry(xa, xas, max);
+ } while (!entry);
+
+ return entry;
+}
+
+static inline unsigned int xas_chunk_tag(const struct xa_state *xas,
+ xa_tag_t tag)
+{
+ unsigned long *addr = xas->xa_node->tags[(__force unsigned)tag];
+
+ return find_next_bit(addr, XA_CHUNK_SIZE, xas->xa_offset);
+}
+
+static inline void *xas_next_tag(const struct xarray *xa, struct xa_state *xas,
+ unsigned long max, xa_tag_t tag)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset;
+
+ if (unlikely(!node) || IS_ERR(node))
+ return XA_WALK_END;
+ if (unlikely(node == XA_WALK_RESTART))
+ return xas_find_tag(xa, xas, max, tag);
+
+ xas->xa_offset++;
+ xas->xa_index = (xas->xa_index | ((1UL << node->shift) - 1)) + 1;
+ if (unlikely(xas->xa_index > max))
+ return XA_WALK_END;
+ offset = xas_chunk_tag(xas, tag);
+ if (offset == XA_CHUNK_SIZE)
+ return xas_find_tag(xa, xas, max, tag);
+ if (offset != xas->xa_offset) {
+ xas->xa_index += (offset - xas->xa_offset) << node->shift;
+ xas->xa_offset = offset;
+ if (unlikely(xas->xa_index > max))
+ return XA_WALK_END;
+ }
+
+ return xa_entry(xa, node, offset);
+}
+
+static inline void *xas_prev_slot(const struct xarray *xa,
+ struct xa_state *xas, unsigned long min)
+{
+ struct xa_node *node = xas->xa_node;
+ void *entry;
+
+ if (xas->xa_index <= min || !node)
+ return XA_WALK_END;
+ if (unlikely(node == XA_WALK_RESTART ||
+ ++xas->xa_offset == XA_CHUNK_SIZE))
+ return __xas_prev_slot(xa, xas, min);
+ entry = xa_entry(xa, node, xas->xa_offset);
+ if (unlikely(xa_is_internal(entry)))
+ return __xas_prev_slot(xa, xas, min);
+ xas->xa_index -= 1UL << node->shift;
+ return entry;
+}
+
+/**
+ * xas_for_each() - Iterate over all present entries in this range
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @max: Highest index to return
+ *
+ * The loop body will be invoked for each entry present in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body.
+ */
+#define xas_for_each(xa, xas, entry, max) \
+ for (entry = xas_next(xa, xas, max); \
+ entry != XA_WALK_END; \
+ entry = xas_next(xa, xas, max))
+
+/**
+ * xas_for_each_slot() - Iterate over all slots in this range
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @max: Highest index to return
+ *
+ * The loop body will be executed for each allocated slot in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body.
+ */
+#define xas_for_each_slot(xa, xas, entry, max) \
+ for (entry = xas_next_slot(xa, xas); \
+ entry != XA_WALK_END; \
+ entry = xas_next_slot(xa, xas, max))
+
+/**
+ * xas_for_each_tag() - Iterate over all tagged entries in this range
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @max: Highest index to return
+ *
+ * The loop body will be executed for each tagged entry in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body.
+ */
+#define xas_for_each_tag(xa, xas, entry, max) \
+ for (entry = xas_next_tag(xa, xas, max, tag); \
+ entry != XA_WALK_END; \
+ entry = xas_next_tag(xa, xas, max, tag))
+
+/**
+ * xas_for_each_slot_rev() - Iterate over all slots in this range backwards
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @min: Lowest index to return
+ *
+ * The loop body will be executed for each allocated slot in the xarray
+ * between the current xas position and @min. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body.
+ */
+#define xas_for_each_slot_rev(xa, xas, entry, min) \
+ for (entry = xas_prev_slot(xa, xas); \
+ entry != XA_WALK_END; \
+ entry = xas_prev_slot(xa, xas, max))
+
+/**
+ * xas_store_for_each() - Iterate over all entries, then replace them
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @index: Index to store new entry at
+ * @order: Order of new entry
+ * @new: New entry
+ *
+ * The loop body will be executed for each entry present in the range
+ * specified by @index and @order. After all entries have been processed,
+ * the @new entry will be atomically stored in the xarray.
+ * RCU readers may temporarily see retry entries. If you break out of the
+ * loop, no modifications will have been made to the xarray.
+ */
+#define xas_store_for_each(xa, xas, entry, index, order, new) \
+ for (entry = xas_store_begin(xa, xas, index, order); \
+ entry = xas_store_finish(xa, xas, index, order, new); )
+
+/**
+ * xas_split_for_each() - Create new entries to replace a multislot entry
+ * @xa: Array
+ * @xas: Array operation state
+ * @entry: Pointer to use for iteration
+ * @index: Index of entry to split
+ * @order: Order of new entries to create
+ *
+ * The loop body will be executed for each new entry present in the range
+ * specified by @index and @order. The loop will see the index of the new
+ * entry in @xas->xa_index. It should call xas_store() to set up each new
+ * entry. After the loop has successfully terminated, the new entries will
+ * be atomically stored in the xarray. RCU readers may temporarily see
+ * retry entries. If you break out of the loop, no modifications will have
+ * been made to the xarray and the temporary memory allocation will be freed
+ * by xas_destroy().
+ */
+#define xas_split_for_each(xa, xas, entry, index, order) \
+ for (entry = xas_split_begin(xa, xas, index, order); \
+ entry = xas_split_finish(xa, xas, index, order); )
+
+/* Really advanced. */
+extern struct kmem_cache *xa_node_cache;
+void xas_delete_node(struct xarray *, struct xa_state *);
+
+#endif /* XA_ADVANCED */
+
+extern void xarray_init(void);
+#endif /* _LINUX_XARRAY_H */
diff --git a/lib/Makefile b/lib/Makefile
index 43a80ec3bd10..d8744c25ab29 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -17,7 +17,7 @@ KCOV_INSTRUMENT_debugobjects.o := n
KCOV_INSTRUMENT_dynamic_debug.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o dump_stack.o timerqueue.o\
+ xarray.o rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
@@ -26,6 +26,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
CFLAGS_radix-tree.o += -DCONFIG_SPARSE_RCU_POINTER
CFLAGS_idr.o += -DCONFIG_SPARSE_RCU_POINTER
+CFLAGS_xarray.o += -DCONFIG_SPARSE_RCU_POINTER
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/xarray.c b/lib/xarray.c
new file mode 100644
index 000000000000..fd33b5b91013
--- /dev/null
+++ b/lib/xarray.c
@@ -0,0 +1,1305 @@
+#define XA_ADVANCED
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+
+#ifdef XA_DEBUG
+#define XA_BUG_ON(x) BUG_ON(x)
+#else
+#define XA_BUG_ON(x)
+#endif
+
+#define inc_tag(tag) do { \
+ tag = (__force xa_tag_t)((__force unsigned)(tag) + 1); \
+} while (0)
+
+static inline void xa_tag_set(struct xarray *xa, xa_tag_t tag)
+{
+ if (!(xa->xa_flags & (1 << (__force unsigned)tag)))
+ xa->xa_flags |= (1 << (__force unsigned)tag);
+}
+
+static inline void xa_tag_clear(struct xarray *xa, xa_tag_t tag)
+{
+ if (xa->xa_flags & (1 << (__force unsigned)tag))
+ xa->xa_flags &= ~(1 << (__force unsigned)tag);
+}
+
+static inline bool tag_get(const struct xa_node *node, unsigned int offset,
+ xa_tag_t tag)
+{
+ return test_bit(offset, node->tags[(__force unsigned)tag]);
+}
+
+static inline void tag_set(struct xa_node *node, unsigned int offset,
+ xa_tag_t tag)
+{
+ __set_bit(offset, node->tags[(__force unsigned)tag]);
+}
+
+static inline void tag_clear(struct xa_node *node, unsigned int offset,
+ xa_tag_t tag)
+{
+ __clear_bit(offset, node->tags[(__force unsigned)tag]);
+}
+
+static inline void tag_set_all(struct xa_node *node, xa_tag_t tag)
+{
+ bitmap_fill(node->tags[(__force unsigned)tag], XA_CHUNK_SIZE);
+}
+
+static inline bool tag_any_set(struct xa_node *node, xa_tag_t tag)
+{
+ return !bitmap_empty(node->tags[(__force unsigned)tag], XA_CHUNK_SIZE);
+}
+
+/*
+ * Use this to calculate the maximum index that will need to be created
+ * in order to add the entry described by @xas. Because we cannot store a
+ * multiple-slot entry at index 0, the calculation is a little more complex
+ * than you might expect.
+ */
+static unsigned long xas_max(struct xa_state *xas)
+{
+ unsigned long mask, max = xas->xa_index;
+
+ if (xas->xa_shift || xas->xa_slots > 1) {
+ mask = ((xas->xa_slots << xas->xa_shift) - 1);
+ max |= mask;
+ if (mask == max)
+ max++;
+ }
+ return max;
+}
+
+static unsigned long set_index(struct xa_node *node, unsigned int offset,
+ unsigned long index)
+{
+ unsigned long mask = ((unsigned long)XA_CHUNK_SIZE << node->shift) - 1;
+
+ return (index & ~mask) + ((unsigned long)offset << node->shift);
+}
+
+/* XXX: kill? */
+static unsigned int node_offset(struct xa_node *node, unsigned long index)
+{
+ return (index >> node->shift) & XA_CHUNK_MASK;
+}
+
+static unsigned long xas_offset(struct xa_state *xas)
+{
+ return (xas->xa_index >> xas->xa_node->shift) & XA_CHUNK_MASK;
+}
+
+/* Returns true if 'entry' can contain 'index' */
+static bool xa_bounds(unsigned long index, void *entry)
+{
+ struct xa_node *node;
+ unsigned int max;
+
+ if (!xa_is_node(entry))
+ return index == 0;
+ node = xa_node(entry);
+ max = node->max;
+ if (max == 0)
+ max = 63;
+ return (index >> node->shift) <= max;
+}
+
+static void *xas_start(const struct xarray *xa, struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+ void *entry;
+ unsigned long offset;
+
+ if (!xas_restart(xas)) {
+ if (node)
+ return xa_entry(xa, xas->xa_node, xas->xa_offset);
+ else
+ return xa_head(xa);
+ }
+
+ entry = xa_head(xa);
+ if (!xa_is_node(entry)) {
+ if (xas->xa_index)
+ return XA_WALK_END;
+ xas->xa_node = NULL;
+ return entry;
+ }
+
+ node = xa_node(entry);
+ if (xas->xa_shift > node->shift) {
+ xas->xa_node = NULL;
+ return entry;
+ }
+
+ offset = xas->xa_index >> node->shift;
+ if (offset > XA_CHUNK_MASK)
+ return XA_WALK_END;
+
+ xas->xa_node = node;
+ xas->xa_offset = offset;
+ return entry;
+}
+
+/**
+ * xas_init_range() - Initialise xarray operation state for a multislot entry
+ * @xas: Array operation state.
+ * @index: Eventual target of the operation.
+ * @shift: Shift of the node this will occupy
+ * @slots: Number of slots in that node to occupy
+ */
+void xas_init_range(struct xa_state *xas, unsigned long index,
+ unsigned char shift, unsigned char slots)
+{
+ xas->xa_index = index;
+ xas->xa_shift = shift;
+ xas->xa_slots = slots;
+ xas->xa_offset = 0;
+ xas->xa_flags = XAS_FLAG_LOOKUP;
+ xas->xa_node = XA_WALK_RESTART;
+ xas->xa_alloc = NULL;
+ xas->xa_count = 0;
+ xas->xa_exceptional = 0;
+ xas->xa_update = NULL;
+}
+EXPORT_SYMBOL_GPL(xas_init_range);
+
+struct kmem_cache *xa_node_cache;
+
+static void xa_node_ctor(void *p)
+{
+ struct xa_node *node = p;
+
+ memset(&node->tags, 0, sizeof(node->tags));
+ memset(&node->slots, 0, sizeof(node->slots));
+ INIT_LIST_HEAD(&node->private_list);
+}
+
+static void xa_node_rcu_free(struct rcu_head *head)
+{
+ struct xa_node *node = container_of(head, struct xa_node, rcu_head);
+
+ xa_node_ctor(node);
+ kmem_cache_free(xa_node_cache, node);
+}
+
+static void xa_node_free(struct xa_node *node)
+{
+ call_rcu(&node->rcu_head, xa_node_rcu_free);
+}
+
+/**
+ * xas_destroy() - Dispose of any resources used during the xarray operation
+ * @xas: Array operation state.
+ *
+ * If the operation only involved read accesses to the XArray or modifying
+ * existing data in the XArray, there is no need to call this function
+ * (eg xa_set_tag()). However, if you may have allocated memory (for
+ * example by calling xas_nomem()), then call this function.
+ *
+ * This function does not reinitialise the state, so you may continue to
+ * call xas_error(), and you would want to call xas_init() before reusing
+ * this structure. It only releases any resources.
+ */
+void xas_destroy(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_alloc;
+
+ if (!node)
+ return;
+#if 0
+ while (node->count)
+ kmem_cache_free(xa_node_cache, node->slots[node->count - 1]);
+#endif
+ kmem_cache_free(xa_node_cache, node);
+ xas->xa_alloc = NULL;
+}
+EXPORT_SYMBOL_GPL(xas_destroy);
+
+/**
+ * xas_nomem() - Allocate memory if needed
+ * @xas: Array operation state
+ * @gfp: Memory allocation flags
+ *
+ * If we need to add new nodes to the xarray, we try to allocate memory
+ * with GFP_NOWAIT while holding the lock, which will usually succeed.
+ * If it fails, @xas is flagged as needing memory to continue. The caller
+ * should drop the lock and call xas_nomem(). If xas_nomem() succeeds,
+ * the caller should retry the operation.
+ *
+ * Forward progress is guaranteed as one node is allocated here and is
+ * available to the memory allocators.
+ *
+ * Return: true if memory was needed, and was successfully allocated.
+ */
+bool xas_nomem(struct xa_state *xas, gfp_t gfp)
+{
+ if (xas->xa_node != ERR_PTR(-ENOMEM))
+ return false;
+ xas->xa_alloc = kmem_cache_alloc(xa_node_cache, gfp);
+ if (!xas->xa_alloc)
+ return false;
+ xas->xa_node = XA_WALK_RESTART;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xas_nomem);
+
+static void *xas_alloc(struct xarray *xa, struct xa_state *xas,
+ unsigned int shift)
+{
+ struct xa_node *parent = xas->xa_node;
+ struct xa_node *node = xas->xa_alloc;
+
+ if (IS_ERR(parent))
+ return NULL;
+
+ if (node) {
+ xas->xa_alloc = NULL;
+ } else {
+ node = kmem_cache_alloc(xa_node_cache,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!node) {
+ xas->xa_node = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+ }
+
+ if (xas->xa_node) {
+ node->offset = xas->xa_offset;
+ parent->count++;
+ XA_BUG_ON(parent->count > XA_CHUNK_SIZE);
+ } else {
+ node->max = XA_CHUNK_MASK;
+ }
+ XA_BUG_ON(shift > BITS_PER_LONG);
+ node->shift = shift;
+ node->count = 0;
+ node->exceptional = 0;
+ RCU_INIT_POINTER(node->parent, xas->xa_node);
+ node->array = xa;
+
+ return node;
+}
+
+#if 0
+static void *xas_find_cousin(const struct xarray *xa, struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+ void *entry;
+
+ while (offset == 0) {
+ offset = node->offset;
+ node = xa_parent(xa, node);
+ XA_BUG_ON(!node);
+ }
+
+ offset--;
+
+ for (;;) {
+ entry = xa_entry(xa, node, offset);
+
+ if (xa_is_sibling(entry)) {
+ offset = xa_sibling_offset(entry);
+ entry = xa_entry(xa, node, offset);
+ }
+
+ if (!xa_is_node(entry))
+ break;
+ node = xa_node(entry);
+ offset = XA_CHUNK_SIZE - 1;
+ }
+
+ xas->xa_node = node;
+ xas->xa_offset = offset;
+ return entry;
+}
+ } else if (unlikely(xa_cousin_entry(entry))) {
+ return xas_find_cousin(xa, xas);
+#endif
+
+static void *xas_descend(const struct xarray *xa, struct xa_state *xas,
+ struct xa_node *node)
+{
+ unsigned int offset = node_offset(node, xas->xa_index);
+ void *entry = xa_entry(xa, node, offset);
+
+ if (xa_is_sibling(entry)) {
+ offset = xa_sibling_offset(entry);
+ entry = xa_entry(xa, node, offset);
+ }
+
+ xas->xa_node = node;
+ xas->xa_offset = offset;
+ return entry;
+}
+
+/**
+ * xas_get_tag() - Returns the state of this tag
+ * @xa: Array
+ * @xas: Array operation state
+ * @tag: Tag value
+ *
+ * Return: true if the tag is set, false if the tag is clear.
+ */
+bool xas_get_tag(const struct xarray *xa, const struct xa_state *xas,
+ xa_tag_t tag)
+{
+ if (xas_restart(xas) || xas_error(xas))
+ return false;
+ if (!xas->xa_node)
+ return xa_tagged(xa, tag);
+ return tag_get(xas->xa_node, xas->xa_offset, tag);
+}
+EXPORT_SYMBOL_GPL(xas_get_tag);
+
+/**
+ * xas_set_tag() - Sets the tag on this entry and its parents
+ * @xa: Array
+ * @xas: Array operation state
+ * @tag: Tag value
+ *
+ * Sets the specified tag on this entry, and walks up the tree setting it
+ * on all the ancestor entries.
+ */
+void xas_set_tag(struct xarray *xa, const struct xa_state *xas, xa_tag_t tag)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+
+ if (IS_ERR(node))
+ return;
+
+ while (node) {
+ if (tag_get(node, offset, tag))
+ return;
+ tag_set(node, offset, tag);
+ offset = node->offset;
+ node = xa_parent(xa, node);
+ }
+
+ if (!xa_tagged(xa, tag))
+ xa_tag_set(xa, tag);
+}
+EXPORT_SYMBOL_GPL(xas_set_tag);
+
+/**
+ * xas_clear_tag() - Clears the tag on this entry and its parents
+ * @xa: Array
+ * @xas: Array operation state
+ * @tag: Tag value
+ *
+ * Clears the specified tag on this entry, and walks up the tree
+ * attempting to clear it on all the ancestor entries. A tag may
+ * only be cleared on an ancestor entry if none of its children
+ * have that tag set.
+ */
+void xas_clear_tag(struct xarray *xa, const struct xa_state *xas, xa_tag_t tag)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+
+ if (IS_ERR(node))
+ return;
+
+ while (node) {
+ if (!tag_get(node, offset, tag))
+ return;
+ tag_clear(node, offset, tag);
+ if (tag_any_set(node, tag))
+ return;
+
+ offset = node->offset;
+ node = xa_parent(xa, node);
+ }
+
+ if (xa_tagged(xa, tag))
+ xa_tag_clear(xa, tag);
+}
+EXPORT_SYMBOL_GPL(xas_clear_tag);
+
+/**
+ * xas_init_tags() - Initialise all tags for the entry
+ * @xa: Array
+ * @xas: Array operations state.
+ *
+ * Initialise all tags for the entry specified by @xas. If we're tracking
+ * free entries with a tag, we need to set it on all entries. All other
+ * tags are cleared.
+ *
+ * This implementation is not as efficient as it could be; we may walk
+ * up the tree multiple times.
+ */
+void xas_init_tags(struct xarray *xa, const struct xa_state *xas)
+{
+ xa_tag_t tag = 0;
+
+ if (xa_track_free(xa)) {
+ xas_set_tag(xa, xas, XA_FREE_TAG);
+ inc_tag(tag);
+ }
+ for (;;) {
+ xas_clear_tag(xa, xas, tag);
+ if (tag == XA_TAG_MAX)
+ break;
+ inc_tag(tag);
+ }
+}
+EXPORT_SYMBOL_GPL(xas_init_tags);
+
+/**
+ * xas_load() - Find the entry for the index
+ * @xa: Array.
+ * @xas: Array operation state.
+ *
+ * If the @xas is in an error state, returns the error in the state.
+ * If it is in the RESTART_WALK state, will return XA_WALK_END if the
+ * xa_index cannot be contained in the current xarray without expanding it.
+ * If there is no entry for the index, the walk may stop at a level in the
+ * tree higher than the entry, or even at the root.
+ *
+ * Return: An entry in the tree that is not a sibling or node entry. May be
+ * a NULL pointer, a user pointer, exceptional entry, retry entry, or an
+ * IDR_NULL.
+ */
+void *xas_load(const struct xarray *xa, struct xa_state *xas)
+{
+ void *entry;
+
+ if (xas_error(xas))
+ return xas->xa_node;
+
+ entry = xas_start(xa, xas);
+ while (xa_is_node(entry)) {
+ struct xa_node *node = xa_node(entry);
+
+ if (xas->xa_shift > node->shift)
+ break;
+ entry = xas_descend(xa, xas, node);
+ }
+ return entry;
+}
+EXPORT_SYMBOL_GPL(xas_load);
+
+static void xas_shrink(struct xarray *xa, const struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ for (;;) {
+ void *entry;
+
+ XA_BUG_ON(node->count > XA_CHUNK_SIZE);
+ if (node->count != 1)
+ break;
+ entry = xa_entry_locked(xa, node, 0);
+ if (!entry)
+ break;
+ if (!xa_is_node(entry) && node->shift)
+ break;
+
+ RCU_INIT_POINTER(xa->xa_head, entry);
+ if (xa_track_free(xa) && !tag_get(node, 0, XA_FREE_TAG))
+ xa_tag_clear(xa, XA_FREE_TAG);
+
+ node->count = 0;
+ node->exceptional = 0;
+ if (xa_is_node(entry))
+ RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
+ VM_WARN_ON_ONCE(!list_empty(&node->private_list));
+ xa_node_free(node);
+ if (!xa_is_node(entry))
+ break;
+ node = xa_node(entry);
+ if (xas->xa_update)
+ xas->xa_update(node);
+ }
+}
+
+void xas_delete_node(struct xarray *xa, struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ for (;;) {
+ struct xa_node *parent;
+
+ XA_BUG_ON(node->count > XA_CHUNK_SIZE);
+ if (node->count)
+ break;
+
+ parent = xa_parent_locked(xa, node);
+ VM_WARN_ON_ONCE(!list_empty(&node->private_list));
+ xas->xa_node = parent;
+ xas->xa_offset = node->offset;
+ xa_node_free(node);
+
+ if (!parent) {
+ xa->xa_head = NULL;
+ xas->xa_node = XA_WALK_RESTART;
+ return;
+ }
+
+ parent->slots[xas->xa_offset] = NULL;
+ parent->count--;
+ XA_BUG_ON(parent->count > XA_CHUNK_SIZE);
+ node = parent;
+ if (xas->xa_update)
+ xas->xa_update(node);
+ }
+
+ if (!node->parent)
+ xas_shrink(xa, xas);
+}
+
+/**
+ * xas_free_nodes() - Free this node and all nodes that it references
+ * @xa: Array
+ * @xas: Array operation state
+ * @top: Node to free
+ *
+ * This node has been removed from the tree. We must now free it and all
+ * of its subnodes. There may be RCU walkers with references into the tree,
+ * so we must replace all entries with retry markers.
+ */
+static void xas_free_nodes(const struct xarray *xa, struct xa_state *xas,
+ struct xa_node *top)
+{
+ unsigned int offset = 0;
+ struct xa_node *node = top;
+
+ for (;;) {
+ void *entry = xa_entry_locked(xa, node, offset);
+
+ if (xa_is_node(entry)) {
+ node = xa_node(entry);
+ offset = 0;
+ continue;
+ }
+ if (entry) {
+ RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
+ if (xa_is_exceptional(entry))
+ xas->xa_exceptional--;
+ xas->xa_count--;
+ }
+ offset++;
+ while (offset == XA_CHUNK_SIZE) {
+ struct xa_node *parent = xa_parent_locked(xa, node);
+
+ offset = node->offset + 1;
+ node->count = 0;
+ node->exceptional = 0;
+ if (xas->xa_update)
+ xas->xa_update(node);
+ VM_WARN_ON_ONCE(!list_empty(&node->private_list));
+ xa_node_free(node);
+ if (node == top)
+ return;
+ node = parent;
+ }
+ }
+}
+
+/*
+ * xas_expand adds nodes to the head of the tree until it has reached
+ * sufficient height to be able to contain index
+ */
+static int xas_expand(struct xarray *xa, struct xa_state *xas, void *entry)
+{
+ struct xa_node *node = NULL;
+ unsigned int shift = 0;
+ unsigned long max = xas_max(xas);
+
+ if (!entry) {
+ if (max == 0)
+ return 0;
+ while ((max >> shift) >= XA_CHUNK_SIZE)
+ shift += XA_CHUNK_SHIFT;
+ return shift + XA_CHUNK_SHIFT;
+ } else if (xa_is_node(entry)) {
+ node = xa_node(entry);
+ shift = node->shift + XA_CHUNK_SHIFT;
+ }
+ xas->xa_node = NULL;
+
+ while (!xa_bounds(max, entry)) {
+ xa_tag_t tag = 0;
+
+ XA_BUG_ON(shift > BITS_PER_LONG);
+ node = xas_alloc(xa, xas, shift);
+ if (!node)
+ return -ENOMEM;
+
+ node->count = 1;
+ if (xa_is_exceptional(entry))
+ node->exceptional = 1;
+ RCU_INIT_POINTER(node->slots[0], entry);
+
+ /* Propagate the aggregated tag info to the new child */
+ if (xa_track_free(xa)) {
+ tag_set_all(node, XA_FREE_TAG);
+ if (!xa_tagged(xa, XA_FREE_TAG)) {
+ tag_clear(node, 0, XA_FREE_TAG);
+ xa_tag_set(xa, XA_FREE_TAG);
+ }
+ inc_tag(tag);
+ }
+ for (;;) {
+ if (xa_tagged(xa, tag))
+ tag_set(node, 0, tag);
+ if (tag == XA_TAG_MAX)
+ break;
+ inc_tag(tag);
+ }
+
+ /*
+ * Now that the new node is fully initialised, we can add
+ * it to the tree
+ */
+ if (xa_is_node(entry)) {
+ xa_node(entry)->offset = 0;
+ rcu_assign_pointer(xa_node(entry)->parent, node);
+ }
+ entry = xa_mk_node(node);
+ rcu_assign_pointer(xa->xa_head, entry);
+
+ shift += XA_CHUNK_SHIFT;
+ }
+
+ xas->xa_node = node;
+ return shift;
+}
+
+void *xas_create(struct xarray *xa, struct xa_state *xas)
+{
+ void *entry;
+ void __rcu **slot;
+ struct xa_node *node = xas->xa_node;
+ int shift;
+ unsigned int order = xas->xa_shift;
+
+ if (node == XA_WALK_RESTART) {
+ entry = xa_head_locked(xa);
+ xas->xa_node = NULL;
+ shift = xas_expand(xa, xas, entry);
+ if (shift < 0)
+ goto err;
+ entry = xa_head_locked(xa);
+ slot = &xa->xa_head;
+ } else if (IS_ERR(node)) {
+ return node;
+ } else if (node) {
+ unsigned int offset = xas->xa_offset;
+
+ shift = node->shift;
+ entry = xa_entry_locked(xa, node, offset);
+ slot = &node->slots[offset];
+ } else {
+ shift = 0;
+ entry = xa_head_locked(xa);
+ slot = &xa->xa_head;
+ }
+
+ while (shift > order) {
+ shift -= XA_CHUNK_SHIFT;
+ if (!entry) {
+ node = xas_alloc(xa, xas, shift);
+ if (!node)
+ break;
+ if (xa_track_free(xa))
+ tag_set_all(node, XA_FREE_TAG);
+ rcu_assign_pointer(*slot, xa_mk_node(node));
+ } else if (xa_is_node(entry)) {
+ node = xa_node(entry);
+ } else {
+ break;
+ }
+ entry = xas_descend(xa, xas, node);
+ slot = &node->slots[xas->xa_offset];
+ }
+
+ return entry;
+err:
+ xas_set_err(xas, -ENOMEM);
+ return xas->xa_node;
+}
+EXPORT_SYMBOL_GPL(xas_create);
+
+/* XXX: mishandles counts if you have something like
+ * exceptional, sibling, NULL, normal and store something
+ * over the top of all four. write a testcase for it, then fix it.
+ */
+static void handle_sibling_slots(const struct xarray *xa, struct xa_state *xas,
+ void *entry, int *countp, int *exceptionalp)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+ unsigned int slots = xas->xa_slots;
+ void *sibling = entry ? xa_mk_sibling(offset) : NULL;
+
+ while (++offset < XA_CHUNK_SIZE) {
+ void *next = xa_entry(xa, node, offset);
+
+ if (--slots)
+ RCU_INIT_POINTER(node->slots[offset], sibling);
+ else if (!xa_is_sibling(next))
+ break;
+
+ if (xa_is_node(next))
+ xas_free_nodes(xa, xas, xa_node(next));
+ *countp += !next - !entry;
+ *exceptionalp += !xa_is_exceptional(next) -
+ !xa_is_exceptional(entry);
+ }
+}
+
+/**
+ * xas_store() - Store this entry in the array
+ * @xa: Array
+ * @xas: State
+ * @entry: New entry
+ *
+ * Return: The old value at this index or ERR_PTR() if an error happened
+ */
+void *xas_store(struct xarray *xa, struct xa_state *xas, void *entry)
+{
+ struct xa_node *node;
+ int count, exceptional;
+ void *curr;
+
+ if (entry)
+ curr = xas_create(xa, xas);
+ else
+ curr = xas_load(xa, xas);
+ if (xas_restart(xas))
+ return NULL;
+
+ node = xas->xa_node;
+ if (IS_ERR(node))
+ return node;
+ if (node)
+ rcu_assign_pointer(node->slots[xas->xa_offset], entry);
+ else
+ rcu_assign_pointer(xa->xa_head, entry);
+ if (!entry)
+ xas_init_tags(xa, xas);
+ else if (xa_track_free(xa))
+ xas_clear_tag(xa, xas, XA_FREE_TAG);
+
+ exceptional = !xa_is_exceptional(curr) - !xa_is_exceptional(entry);
+ count = !curr - !entry;
+ if (xa_is_node(curr))
+ xas_free_nodes(xa, xas, xa_node(curr));
+ if (node)
+ handle_sibling_slots(xa, xas, entry, &count, &exceptional);
+
+ if (!xa_is_internal(entry)) {
+ xas->xa_count += count;
+ xas->xa_exceptional += exceptional;
+ }
+ if (node) {
+ node->count += count;
+ XA_BUG_ON(node->count > XA_CHUNK_SIZE);
+ node->exceptional += exceptional;
+ XA_BUG_ON(node->exceptional > XA_CHUNK_SIZE);
+ if ((count || exceptional) && xas->xa_update)
+ xas->xa_update(node);
+ if (count < 0)
+ xas_delete_node(xa, xas);
+ }
+
+ return curr;
+}
+EXPORT_SYMBOL_GPL(xas_store);
+
+static bool xas_is_sibling(const struct xa_state *xas, void *entry)
+{
+ return (xas->xa_flags & XAS_FLAG_LOOKUP) && xa_is_sibling(entry);
+}
+
+/*
+ * xas_next_entry() - Helper for other tree walking functions
+ *
+ * As a helper, this function has a lot of rough edges. On entry,
+ * xas->xa_index may not lay within xas->xa_node (which is why we
+ * start by walking back up the tree if it isn't). xas->xa_index
+ * is assumed to have been rounded to point to the head of the next entry,
+ * even if the previous iteration hit in the middle of a multislot entry.
+ */
+void *xas_next_entry(const struct xarray *xa, struct xa_state *xas,
+ unsigned long max)
+{
+ XA_BUG_ON(xas_error(xas));
+
+ while (xas->xa_node) {
+ void *entry;
+
+ if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
+ xas->xa_offset = xas->xa_node->offset + 1;
+ xas->xa_node = xa_parent(xa, xas->xa_node);
+ continue;
+ }
+ entry = xas_load(xa, xas);
+
+ if (xa_is_node(entry)) {
+ xas->xa_node = xa_node(entry);
+ xas->xa_offset = xas_offset(xas);
+ continue;
+ } else if (xas_is_sibling(xas, entry)) {
+ xas->xa_offset = xa_sibling_offset(entry);
+ entry = xa_entry(xa, xas->xa_node, xas->xa_offset);
+ xas->xa_flags &= ~XAS_FLAG_LOOKUP;
+ return entry;
+ } else if (entry && !xa_is_internal(entry))
+ return entry;
+
+ if (xas->xa_node <= XA_WALK_RESTART)
+ break;
+ xas->xa_offset++;
+ xas->xa_index += 1UL << xas->xa_node->shift;
+ if (xas->xa_index > max)
+ break;
+ }
+
+ return XA_WALK_END;
+}
+EXPORT_SYMBOL_GPL(xas_next_entry);
+
+/**
+ * xas_find_tag() - Search the xarray for a tagged entry
+ * @xa: Array
+ * @xas: Array operation state
+ * @max: Maximum value to return
+ * @tag: Tag number to search for
+ *
+ * Finds the first tagged entry at or after the index in @xas
+ * tag set and is less than or equal to @max.
+ *
+ * Return: The entry, if found, otherwise XA_WALK_END.
+ */
+void *xas_find_tag(const struct xarray *xa, struct xa_state *xas,
+ unsigned long max, xa_tag_t tag)
+{
+ void *entry = XA_WALK_END;
+ int offset;
+
+ XA_BUG_ON(xas_error(xas));
+
+ if (xas->xa_index > max)
+ return entry;
+
+ if (xas_restart(xas)) {
+ struct xa_node *node;
+ unsigned long offset;
+
+ entry = xa_head(xa);
+ if (!xa_tagged(xa, tag)) {
+ if (xa_is_node(entry))
+ xas->xa_index = XA_CHUNK_SIZE <<
+ xa_node(entry)->shift;
+ else if (xas->xa_index < 1)
+ xas->xa_index = 1;
+ return XA_WALK_END;
+ }
+ if (!xa_is_node(entry)) {
+ if (xas->xa_index)
+ return XA_WALK_END;
+ xas->xa_node = NULL;
+ return entry;
+ }
+ node = xa_node(entry);
+ offset = xas->xa_index >> node->shift;
+ if (offset > XA_CHUNK_MASK)
+ return XA_WALK_END;
+ xas->xa_node = node;
+ xas->xa_offset = offset;
+ entry = XA_WALK_END;
+ }
+
+ while (xas->xa_node) {
+ offset = xas_chunk_tag(xas, tag);
+ if (offset != xas->xa_offset) {
+ unsigned long index = set_index(xas->xa_node, offset,
+ xas->xa_index);
+ if (!index || index > max) {
+ xas->xa_index = max + 1;
+ break;
+ }
+ xas->xa_index = index;
+ xas->xa_offset = offset;
+ }
+
+ if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
+ xas->xa_offset = xas->xa_node->offset + 1;
+ xas->xa_node = xa_parent(xa, xas->xa_node);
+ continue;
+ }
+
+ entry = xa_entry(xa, xas->xa_node, offset);
+ if (!xa_is_node(entry))
+ break;
+ xas->xa_node = xa_node(entry);
+ xas->xa_offset = xas_offset(xas);
+ entry = XA_WALK_END;
+ }
+
+ if (entry == XA_WALK_END)
+ xas->xa_node = XA_WALK_RESTART;
+ return entry;
+}
+EXPORT_SYMBOL_GPL(xas_find_tag);
+
+/**
+ * xa_load() - Load the entry from the array
+ * @xa: Array
+ * @index: index in array
+ *
+ * Return: The entry at @index in @xa.
+ */
+void *xa_load(const struct xarray *xa, unsigned long index)
+{
+ struct xa_state xas;
+ void *entry;
+
+ xas_init(&xas, index);
+ rcu_read_lock();
+ entry = xas_start(xa, &xas);
+ while (xa_is_node(entry)) {
+ entry = xas_descend(xa, &xas, xa_node(entry));
+ if (xa_is_retry(entry))
+ entry = xas_start(xa, &xas);
+ }
+ rcu_read_unlock();
+
+ if (entry == XA_WALK_END)
+ entry = NULL;
+ return entry;
+}
+EXPORT_SYMBOL(xa_load);
+
+/**
+ * xa_store() - Store this entry in the array
+ * @xa: Array
+ * @index: index in array
+ * @entry: New entry
+ * @gfp: Allocation flags
+ *
+ * Stores almost always succeed. The notable exceptions:
+ * - Attempted to store a reserved pointer value (-EINVAL)
+ * - Ran out of memory trying to allocate new nodes (-ENOMEM)
+ *
+ * Storing into an existing multientry updates the value of every index.
+ *
+ * Return: The old value at this index or ERR_PTR() if an error happened
+ */
+void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
+{
+ struct xa_state xas;
+ unsigned long flags;
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return ERR_PTR(-EINVAL);
+
+ xas_init(&xas, index);
+ do {
+ xa_lock_irqsave(xa, flags);
+ curr = xas_store(xa, &xas, entry);
+ xa_unlock_irqrestore(xa, flags);
+ } while (xas_nomem(&xas, gfp));
+ xas_destroy(&xas);
+
+ if (xas_error(&xas))
+ curr = xas.xa_node;
+ return curr;
+}
+EXPORT_SYMBOL(xa_store);
+
+/**
+ * xa_replace() - Conditionally replace this entry in the array
+ * @xa: Array
+ * @index: index in array
+ * @entry: New value to place in array
+ * @old: Old value to test against
+ * @gfp: Allocation flags
+ *
+ * If the entry at @index is the same as @old, replace it with @entry.
+ * If the return value is equal to @old, then the exchange was successful.
+ *
+ * Return: The old value at this index or ERR_PTR() if an error happened
+ */
+void *xa_replace(struct xarray *xa, unsigned long index,
+ void *entry, void *old, gfp_t gfp)
+{
+ struct xa_state xas;
+ unsigned long flags;
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return ERR_PTR(-EINVAL);
+
+ xas_init(&xas, index);
+ do {
+ xa_lock_irqsave(xa, flags);
+ curr = xas_create(xa, &xas);
+ if (curr == old)
+ xas_store(xa, &xas, entry);
+ xa_unlock_irqrestore(xa, flags);
+ } while (xas_nomem(&xas, gfp));
+ xas_destroy(&xas);
+
+ if (xas_error(&xas))
+ curr = xas.xa_node;
+ return curr;
+}
+EXPORT_SYMBOL(xa_replace);
+
+/**
+ * xa_get_tag() - Inquire whether this tag is set on this entry
+ * @xa: Array
+ * @index: Index of entry
+ * @tag: Tag value
+ *
+ * This function is protected by the RCU read lock, so the result may be
+ * out of date by the time it returns. If you need the result to be stable,
+ * use a lock.
+ *
+ * Return: True if the entry at @index has this tag set, false if it doesn't.
+ */
+bool __xa_get_tag(const struct xarray *xa, unsigned long index, xa_tag_t tag)
+{
+ struct xa_state xas;
+ void *entry;
+
+ xas_init(&xas, index);
+ rcu_read_lock();
+ entry = xas_start(xa, &xas);
+ while (xas_get_tag(xa, &xas, tag)) {
+ if (!xa_is_node(entry))
+ goto found;
+ entry = xas_descend(xa, &xas, xa_node(entry));
+ }
+ rcu_read_unlock();
+ return false;
+found:
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(__xa_get_tag);
+
+/**
+ * xa_set_tag() - Set this tag on this entry.
+ * @xa: Array
+ * @index: Index of entry
+ * @tag: Tag value
+ *
+ * Attempting to set a tag on a NULL entry does not succeed.
+ *
+ * Return: The entry at this index or ERR_PTR() if an error occurs.
+ */
+void *__xa_set_tag(struct xarray *xa, unsigned long index, xa_tag_t tag)
+{
+ struct xa_state xas;
+ unsigned long flags;
+ void *entry;
+
+ xas_init(&xas, index);
+ xa_lock_irqsave(xa, flags);
+ entry = xas_load(xa, &xas);
+ if (entry == XA_WALK_END)
+ entry = NULL;
+ if (entry)
+ xas_set_tag(xa, &xas, tag);
+ xa_unlock_irqrestore(xa, flags);
+
+ return entry;
+}
+EXPORT_SYMBOL(__xa_set_tag);
+
+/**
+ * xa_clear_tag() - Clear this tag on this entry.
+ * @xa: Array
+ * @index: Index of entry
+ * @tag: Tag value
+ *
+ * Clearing a tag on an entry which doesn't exist always succeeds
+ *
+ * Return: The entry at this index or ERR_PTR() if an error occurs.
+ */
+void *__xa_clear_tag(struct xarray *xa, unsigned long index, xa_tag_t tag)
+{
+ struct xa_state xas;
+ unsigned long flags;
+ void *entry;
+
+ xas_init(&xas, index);
+ xa_lock_irqsave(xa, flags);
+ entry = xas_load(xa, &xas);
+ if (entry == XA_WALK_END)
+ entry = NULL;
+ if (entry)
+ xas_clear_tag(xa, &xas, tag);
+ xa_unlock_irqrestore(xa, flags);
+
+ return entry;
+}
+EXPORT_SYMBOL(__xa_clear_tag);
+
+/**
+ * xa_find() - Search the xarray for a present entry
+ * @xa: Array
+ * @indexp: Pointer to an index
+ * @max: Maximum value to return
+ *
+ * Finds the entry in the xarray with the lowest index that is between
+ * *@...exp and max, inclusive. If an entry is found, updates @indexp to
+ * be the index of the pointer. This function is protected by the RCU read
+ * lock, so it may not find all entries if called in a loop.
+ *
+ * Return: The pointer, if found, otherwise NULL.
+ */
+void *xa_find(const struct xarray *xa, unsigned long *indexp, unsigned long max)
+{
+ struct xa_state xas;
+ void *entry;
+
+ xas_init(&xas, *indexp);
+
+ rcu_read_lock();
+ do {
+ entry = xas_next(xa, &xas, max);
+ if (xa_is_retry(entry))
+ xas_retry(&xas);
+ if (entry == XA_WALK_END)
+ entry = NULL;
+ } while (xa_is_internal(entry));
+ rcu_read_unlock();
+
+ if (entry)
+ *indexp = xas.xa_index;
+ return entry;
+}
+EXPORT_SYMBOL(xa_find);
+
+/**
+ * xa_next() - Search the xarray for the next present entry
+ * @xa: Array
+ * @indexp: Pointer to an index
+ * @max: Maximum value to return
+ *
+ * Finds the entry in the xarray with the lowest index that is above
+ * *@...exp and not greater than max. If an entry is found, updates
+ * @indexp to be the index of the pointer.
+ *
+ * Return: The pointer, if found, otherwise NULL.
+ */
+void *xa_next(const struct xarray *xa, unsigned long *indexp, unsigned long max)
+{
+ struct xa_state xas;
+ void *entry;
+
+ xas_init(&xas, *indexp + 1);
+ xas.xa_flags &= ~XAS_FLAG_LOOKUP;
+
+ rcu_read_lock();
+ do {
+ entry = xas_next(xa, &xas, max);
+ if (xa_is_retry(entry))
+ xas_retry(&xas);
+ if (entry == XA_WALK_END)
+ entry = NULL;
+ } while (xa_is_internal(entry));
+ rcu_read_unlock();
+
+ if (entry)
+ *indexp = xas.xa_index;
+ return entry;
+}
+EXPORT_SYMBOL(xa_next);
+
+/**
+ * xa_get_entries() - Copy entries from the xarray into a normal array
+ * @xa: The source XArray to copy from
+ * @dst: The buffer to copy pointers into
+ * @start: The first index in the XArray eligible to be copied from
+ * @n: The maximum number of entries to copy
+ *
+ * Return: The number of entries copied.
+ */
+int xa_get_entries(const struct xarray *xa, unsigned long start, void **dst,
+ unsigned int n)
+{
+ struct xa_state xas;
+ void *entry;
+ unsigned int i = 0;
+
+ if (!n)
+ return 0;
+
+ xas_init(&xas, start);
+ rcu_read_lock();
+ xas_for_each(xa, &xas, entry, ~0UL) {
+ if (xa_is_retry(entry)) {
+ xas_retry(&xas);
+ continue;
+ }
+ dst[i++] = entry;
+ if (i == n)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+EXPORT_SYMBOL(xa_get_entries);
+
+/**
+ * xa_get_tagged() - Copy tagged entries from the xarray into a normal array
+ * @xa: The source XArray to copy from
+ * @dst: The buffer to copy pointers into
+ * @start: The first index in the XArray eligible to be copied from
+ * @n: The maximum number of entries to copy
+ *
+ * Return: The number of entries copied.
+ */
+int xa_get_tagged(const struct xarray *xa, unsigned long start, void **dst,
+ unsigned int n, xa_tag_t tag)
+{
+ struct xa_state xas;
+ void *entry;
+ unsigned int i = 0;
+
+ if (!n)
+ return 0;
+
+ xas_init(&xas, start);
+ rcu_read_lock();
+ xas_for_each_tag(xa, &xas, entry, ~0UL) {
+ if (xa_is_retry(entry)) {
+ xas_retry(&xas);
+ continue;
+ }
+ dst[i++] = entry;
+ if (i == n)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+EXPORT_SYMBOL(xa_get_tagged);
+
+void __init xarray_init(void)
+{
+ xa_node_cache = kmem_cache_create("xarray node",
+ sizeof(struct xa_node), 0,
+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
+ xa_node_ctor);
+}
--
2.11.0
Powered by blists - more mailing lists