Message-ID: <6433156f-34a8-400f-e282-91268b242279@bytedance.com>
Date:   Thu, 17 Nov 2022 15:19:20 +0800
From:   Zhongkun He <hezhongkun.hzk@...edance.com>
To:     Michal Hocko <mhocko@...e.com>
Cc:     Andrew Morton <akpm@...ux-foundation.org>, corbet@....net,
        linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        linux-api@...r.kernel.org, linux-doc@...r.kernel.org
Subject: Re: [External] Re: [PATCH v2] mm: add new syscall
 pidfd_set_mempolicy().

Hi Michal, thanks for your reply.

> 
> It would be better to add the patch that has been tested.

OK.

> 
> One way to deal with that would be to use a similar model as css_tryget

Percpu_ref is a good way to reduce the refcounting overhead on the fast
path, but it could make mempolicy heavy. A mempolicy is only 32 bytes,
may not have a long lifetime, and is duplicated from the parent in
fork(). If we switch the atomic_t to a percpu_ref, reads on the fast
path become cheaper, but creation and destruction become more expensive
and the occupied space grows significantly. I am not really sure it is
worth it.

atomic_t:                                                 4 bytes
percpu_ref + percpu_ref_data + nr_cpus * unsigned long:   16 + 56 + nr_cpus * 8 bytes
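
For comparison, a css_tryget-style lifetime built on percpu_ref would look
roughly like the sketch below. This is not part of the tested patch;
mempolicy_ref, mempolicy_ref_init, mempolicy_ref_release and mempolicy_tryget
are made-up names, only to illustrate where the extra 16 + 56 + nr_cpus * 8
bytes per policy would come from:

#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

/* Hypothetical wrapper that would replace the atomic_t refcnt:
 * 16 bytes for percpu_ref, 56 for percpu_ref_data, plus
 * nr_cpus * 8 for the per-cpu counters. */
struct mempolicy_ref {
        struct percpu_ref ref;
        struct mempolicy *pol;
};

/* Called once the last reference is dropped after percpu_ref_kill(). */
static void mempolicy_ref_release(struct percpu_ref *ref)
{
        struct mempolicy_ref *mref = container_of(ref, struct mempolicy_ref, ref);

        __mpol_put(mref->pol);
        percpu_ref_exit(&mref->ref);
        kfree(mref);
}

static int mempolicy_ref_init(struct mempolicy_ref *mref, struct mempolicy *pol)
{
        mref->pol = pol;
        /* Allocates the per-cpu counters: the creation cost mentioned above. */
        return percpu_ref_init(&mref->ref, mempolicy_ref_release, 0, GFP_KERNEL);
}

/* Fast path: a per-cpu increment instead of an atomic_inc() on a shared line. */
static inline bool mempolicy_tryget(struct mempolicy_ref *mref)
{
        return percpu_ref_tryget(&mref->ref);
}

So reads would get cheap, but every policy, even a short-lived one duplicated
in fork(), pays for a per-cpu allocation up front.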

> 
> Btw. have you tried to profile those slowdowns to identify hotspots?
> 
> Thanks

Yes, it degrades performance by about 2%-3%, most likely because of
the task_lock and the atomic operations on the reference count, as
shown in the previous email.

New hotspots in perf:
1.34%  [kernel]          [k] __mpol_put
0.53%  [kernel]          [k] _raw_spin_lock
0.44%  [kernel]          [k] get_task_policy


Tested patch:

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af0..3f1b5c8329a8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -105,10 +105,7 @@ static void hold_task_mempolicy(struct proc_maps_private *priv)
  {
         struct task_struct *task = priv->task;

-       task_lock(task);
         priv->task_mempolicy = get_task_policy(task);
-       mpol_get(priv->task_mempolicy);
-       task_unlock(task);
  }
  static void release_task_mempolicy(struct proc_maps_private *priv)
  {
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..786481d7abfd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -62,7 +62,7 @@ struct mempolicy {
  extern void __mpol_put(struct mempolicy *pol);
  static inline void mpol_put(struct mempolicy *pol)
  {
-       if (pol)
+       if (pol && !(pol->flags & MPOL_F_STATIC))
                 __mpol_put(pol);
  }

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..7c2068163a0c 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -63,7 +63,7 @@ enum {
  #define MPOL_F_SHARED  (1 << 0)        /* identify shared policies */
  #define MPOL_F_MOF     (1 << 3) /* this policy wants migrate on fault */
  #define MPOL_F_MORON   (1 << 4) /* Migrate On protnone Reference On Node */
-
+#define MPOL_F_STATIC (1 << 5)
  /*
   * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
   * ABI.  New bits are OK, but existing bits can never change.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..4cca96a40d04 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1247,6 +1247,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         }

         mpol_cond_put(mpol);
+       mpol_put(mpol);
         return page;

  err:
@@ -2316,6 +2317,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
         if (!page)
                 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
         mpol_cond_put(mpol);
+       mpol_put(mpol);
         return page;
  }

@@ -2352,6 +2354,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
         node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
         page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
         mpol_cond_put(mpol);
+       mpol_put(mpol);

         return page;
  }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 61aa9aedb728..ea670db6881f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -126,6 +126,7 @@ enum zone_type policy_zone = 0;
  static struct mempolicy default_policy = {
         .refcnt = ATOMIC_INIT(1), /* never free it */
         .mode = MPOL_LOCAL,
+       .flags = MPOL_F_STATIC
  };

  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -160,11 +161,19 @@ EXPORT_SYMBOL_GPL(numa_map_to_online_node);

  struct mempolicy *get_task_policy(struct task_struct *p)
  {
-       struct mempolicy *pol = p->mempolicy;
+       struct mempolicy *pol;
         int node;

-       if (pol)
-               return pol;
+       if (p->mempolicy)
+       {
+               task_lock(p);
+               pol = p->mempolicy;
+               mpol_get(pol);
+               task_unlock(p);
+
+               if(pol)
+                       return pol;
+       }

         node = numa_node_id();
         if (node != NUMA_NO_NODE) {
@@ -1764,10 +1773,12 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
                          * count on these policies which will be dropped by
                          * mpol_cond_put() later
+                        *
+                        * if (mpol_needs_cond_ref(pol))
+                        *      mpol_get(pol);
                          */
-                       if (mpol_needs_cond_ref(pol))
-                               mpol_get(pol);
                 }
+               mpol_get(pol);
         }

         return pol;
@@ -1799,9 +1810,9 @@ static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
  bool vma_policy_mof(struct vm_area_struct *vma)
  {
         struct mempolicy *pol;
+       bool ret = false;

         if (vma->vm_ops && vma->vm_ops->get_policy) {
-               bool ret = false;

                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
                 if (pol && (pol->flags & MPOL_F_MOF))
@@ -1812,10 +1823,13 @@ bool vma_policy_mof(struct vm_area_struct *vma)
         }

         pol = vma->vm_policy;
+       mpol_get(pol);
         if (!pol)
                 pol = get_task_policy(current);
+       ret = pol && (pol->flags & MPOL_F_MOF);
+       mpol_put(pol);

-       return pol->flags & MPOL_F_MOF;
+       return ret;
  }

  bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
@@ -2179,7 +2193,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
                 unsigned nid;

                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
-               mpol_cond_put(pol);
                 gfp |= __GFP_COMP;
                 page = alloc_page_interleave(gfp, order, nid);
                 if (page && order > 1)
@@ -2194,7 +2207,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
                 node = policy_node(gfp, pol, node);
                 gfp |= __GFP_COMP;
                 page = alloc_pages_preferred_many(gfp, order, node, pol);
-               mpol_cond_put(pol);
                 if (page && order > 1)
                         prep_transhuge_page(page);
                 folio = (struct folio *)page;
@@ -2219,7 +2231,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,

                 nmask = policy_nodemask(gfp, pol);
                 if (!nmask || node_isset(hpage_node, *nmask)) {
-                       mpol_cond_put(pol);
                         /*
                          * First, try to allocate THP only on local node, but
                          * don't reclaim unnecessarily, just compact.
@@ -2244,8 +2255,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
         nmask = policy_nodemask(gfp, pol);
         preferred_nid = policy_node(gfp, pol, node);
         folio = __folio_alloc(gfp, order, preferred_nid, nmask);
-       mpol_cond_put(pol);
  out:
+       mpol_cond_put(pol);
+       mpol_put(pol);
         return folio;
  }
  EXPORT_SYMBOL(vma_alloc_folio);
@@ -2286,6 +2298,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
                                 policy_node(gfp, pol, numa_node_id()),
                                 policy_nodemask(gfp, pol));

+       mpol_put(pol);
         return page;
  }
  EXPORT_SYMBOL(alloc_pages);
@@ -2365,21 +2378,23 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
                 unsigned long nr_pages, struct page **page_array)
  {
         struct mempolicy *pol = &default_policy;
+       unsigned long allocated;

         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
                 pol = get_task_policy(current);

-       if (pol->mode == MPOL_INTERLEAVE)
-               return alloc_pages_bulk_array_interleave(gfp, pol,
+       if (pol->mode == MPOL_INTERLEAVE) {
+               allocated =  alloc_pages_bulk_array_interleave(gfp, pol,
                                                          nr_pages, page_array);
-
-       if (pol->mode == MPOL_PREFERRED_MANY)
-               return alloc_pages_bulk_array_preferred_many(gfp,
+       } else if (pol->mode == MPOL_PREFERRED_MANY)
+               allocated = alloc_pages_bulk_array_preferred_many(gfp,
                                 numa_node_id(), pol, nr_pages, page_array);
-
-       return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
+       else
+              allocated = __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
                                   policy_nodemask(gfp, pol), nr_pages, NULL,
                                   page_array);
+       mpol_put(pol);
+       return allocated;
  }

  int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -2636,6 +2651,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                 ret = polnid;
  out:
         mpol_cond_put(pol);
+       mpol_put(pol);

         return ret;
  }
@@ -2917,7 +2933,7 @@ void __init numa_policy_init(void)
                 preferred_node_policy[nid] = (struct mempolicy) {
                         .refcnt = ATOMIC_INIT(1),
                         .mode = MPOL_PREFERRED,
-                       .flags = MPOL_F_MOF | MPOL_F_MORON,
+                       .flags = MPOL_F_MOF | MPOL_F_MORON | MPOL_F_STATIC,
                         .nodes = nodemask_of_node(nid),
                 };
         }
