[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1213298105.31518.170.camel@twins>
Date:	Thu, 12 Jun 2008 21:15:05 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Nick Piggin <nickpiggin@...oo.com.au>
Cc:	akpm@...ux-foundation.org, linux-kernel@...r.kernel.org,
	paulmck@...ibm.com
Subject: Re: [patch] radix-tree: fix small lockless radix-tree bug
On Fri, 2008-06-13 at 05:03 +1000, Nick Piggin wrote:
> Hi guys,
> 
> Although this doesn't seem like cause for alarm (as per the analysis),
> it may still be a good 2.6.26 candidate as we should have a few more
> weeks of testing left.
> 
> It should definitely go in -mm with the lockless pagecache patch.
Ouch - good one, I'll back-port it to -rt.
This reminds me, I should get back to my radix-tree path compression
stuff one day.
> When shrinking a radix-tree, we do it in a lockless manner by atomically
> switching the root pointer away from the redundant node (one that only
> has a single entry in the left most slot), and switching it over to its
> lone child.
> 
> A lockless lookup may have got a reference to the parent and be
> in the middle of deciding what to do with it while it is being swapped
> away for its child. For this reason, we also have to keep it around and
> in a valid state for the lookup to proceed and give a valid result, for
> at least an RCU grace period. So we need to keep the child in the left
> most slot there in case that is requested by the lookup.
> 
> This is all pretty standard RCU stuff. It is worth repeating because
> in my eagerness to obey the radix tree node constructor scheme, I had
> broken this by zeroing the radix tree node before the grace period.
> 
> Fix it by clearing those fields in the RCU callback. I would normally
> want to rip out the constructor entirely, but radix tree nodes are one
> of those places where they make sense (only a few cachelines will be
> touched soon after allocation).
> 
> 
> This was never actually observed in any lockless pagecache testing or
> using the test harness, but it showed up as a rare problem while testing
> my scalable vmap rewrite.
> 
> Fortunately, it is not a problem anywhere lockless pagecache is used in
> mainline kernels (pagecache probe is not a guarantee, and brd does not
> have concurrent lookups and deletes).
> 
> However, it would eventually pop up for someone using lockless pagecache :P
> 
> Signed-off-by: Nick Piggin <npiggin@...e.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> ---
> Index: linux-2.6/lib/radix-tree.c
> ===================================================================
> --- linux-2.6.orig/lib/radix-tree.c     2008-06-13 04:26:31.000000000
> +1000
> +++ linux-2.6/lib/radix-tree.c  2008-06-13 04:31:38.000000000 +1000
> @@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct
>         return root->gfp_mask & __GFP_BITS_MASK;
>  }
>  
> +static inline void tag_set(struct radix_tree_node *node, unsigned int
> tag,
> +               int offset)
> +{
> +       __set_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void tag_clear(struct radix_tree_node *node, unsigned
> int tag,
> +               int offset)
> +{
> +       __clear_bit(offset, node->tags[tag]);
> +}
> +
> +static inline int tag_get(struct radix_tree_node *node, unsigned int
> tag,
> +               int offset)
> +{
> +       return test_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void root_tag_set(struct radix_tree_root *root,
> unsigned int tag)
> +{
> +       root->gfp_mask |= (__force gfp_t)(1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear(struct radix_tree_root *root,
> unsigned int tag)
> +{
> +       root->gfp_mask &= (__force gfp_t)~(1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear_all(struct radix_tree_root *root)
> +{
> +       root->gfp_mask &= __GFP_BITS_MASK;
> +}
> +
> +static inline int root_tag_get(struct radix_tree_root *root, unsigned
> int tag)
> +{
> +       return (__force unsigned)root->gfp_mask & (1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +/*
> + * Returns 1 if any slot in the node has this tag set.
> + * Otherwise returns 0.
> + */
> +static inline int any_tag_set(struct radix_tree_node *node, unsigned
> int tag)
> +{
> +       int idx;
> +       for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> +               if (node->tags[tag][idx])
> +                       return 1;
> +       }
> +       return 0;
> +}
>  /*
>   * This assumes that the caller has performed appropriate
> preallocation, and
>   * that the caller has pinned this thread of control to the current
> CPU.
> @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
>  {
>         struct radix_tree_node *node =
>                         container_of(head, struct radix_tree_node,
> rcu_head);
> +
> +       /*
> +        * must only free zeroed nodes into the slab.
> radix_tree_shrink
> +        * can leave us with a non-NULL entry in the first slot, so
> clear
> +        * that here to make sure.
> +        */
> +       tag_clear(node, 0, 0);
> +       tag_clear(node, 1, 0);
> +       node->slots[0] = NULL;
> +       node->count = 0;
> +
>         kmem_cache_free(radix_tree_node_cachep, node);
>  }
>  
> @@ -165,59 +227,6 @@ out:
>  }
>  EXPORT_SYMBOL(radix_tree_preload);
>  
> -static inline void tag_set(struct radix_tree_node *node, unsigned int
> tag,
> -               int offset)
> -{
> -       __set_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void tag_clear(struct radix_tree_node *node, unsigned
> int tag,
> -               int offset)
> -{
> -       __clear_bit(offset, node->tags[tag]);
> -}
> -
> -static inline int tag_get(struct radix_tree_node *node, unsigned int
> tag,
> -               int offset)
> -{
> -       return test_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void root_tag_set(struct radix_tree_root *root,
> unsigned int tag)
> -{
> -       root->gfp_mask |= (__force gfp_t)(1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -
> -static inline void root_tag_clear(struct radix_tree_root *root,
> unsigned int tag)
> -{
> -       root->gfp_mask &= (__force gfp_t)~(1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -static inline void root_tag_clear_all(struct radix_tree_root *root)
> -{
> -       root->gfp_mask &= __GFP_BITS_MASK;
> -}
> -
> -static inline int root_tag_get(struct radix_tree_root *root, unsigned
> int tag)
> -{
> -       return (__force unsigned)root->gfp_mask & (1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -/*
> - * Returns 1 if any slot in the node has this tag set.
> - * Otherwise returns 0.
> - */
> -static inline int any_tag_set(struct radix_tree_node *node, unsigned
> int tag)
> -{
> -       int idx;
> -       for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> -               if (node->tags[tag][idx])
> -                       return 1;
> -       }
> -       return 0;
> -}
> -
>  /*
>   *     Return the maximum key which can be store into a
>   *     radix tree with height HEIGHT.
> @@ -930,11 +939,6 @@ static inline void radix_tree_shrink(str
>                         newptr = radix_tree_ptr_to_indirect(newptr);
>                 root->rnode = newptr;
>                 root->height--;
> -               /* must only free zeroed nodes into the slab */
> -               tag_clear(to_free, 0, 0);
> -               tag_clear(to_free, 1, 0);
> -               to_free->slots[0] = NULL;
> -               to_free->count = 0;
>                 radix_tree_node_free(to_free);
>         }
>  }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Powered by blists - more mailing lists
 
