linux-kernel - Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAGWkznEqhHcAb0ZnO9-ssk1qQHYFKx4ML0vd4Knj_f2n_PpR0g@mail.gmail.com>
Date:   Tue, 23 Apr 2019 19:43:18 +0800
From:   Zhaoyang Huang <huangzhaoyang@...il.com>
To:     Matthew Wilcox <willy@...radead.org>
Cc:     Michal Hocko <mhocko@...nel.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Vlastimil Babka <vbabka@...e.cz>,
        Joonsoo Kim <iamjoonsoo.kim@....com>,
        David Rientjes <rientjes@...gle.com>,
        Zhaoyang Huang <zhaoyang.huang@...soc.com>,
        Roman Gushchin <guro@...com>, Jeff Layton <jlayton@...hat.com>,
        "open list:MEMORY MANAGEMENT" <linux-mm@...ck.org>,
        LKML <linux-kernel@...r.kernel.org>,
        Pavel Tatashin <pasha.tatashin@...een.com>,
        Johannes Weiner <hannes@...xchg.org>
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

rebase the commit to latest mainline and update the code.
@Matthew, with regarding to your comment, I would like to say the
algorithm doesn't change at all. I do NOT judge the page's activity
via an absolute time value, but still the refault distance. What I
want to fix is the scenario which drop lots of file pages on this lru
that leading to a big refault_distance(inactive_age) and inactivate
the page. I haven't found regression of the commit yet. Could you
please suggest me more test cases? Thank you!
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741..ca4ced6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -242,6 +242,7 @@ struct lruvec {
  atomic_long_t inactive_age;
  /* Refaults at the time of last reclaim cycle */
  unsigned long refaults;
+ atomic_long_t refaults_ratio;
 #ifdef CONFIG_MEMCG
  struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67..95683c1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -171,6 +171,15 @@
  1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 19
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the xarray
@@ -180,12 +189,48 @@
  * evictions into coarser buckets by shaving off lower timestamp bits.
  */
 static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+static void pack_secs(unsigned long *peviction)
+{
+ unsigned int secs;
+ unsigned long eviction;
+ int order;
+ int secs_shrink_size;
+ struct timespec64 ts;
+
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+ order = get_count_order(secs);
+ secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+ ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+ eviction = *peviction;
+ eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+ | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+ eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) |
(secs_shrink_size & 0xf);
+ *peviction = eviction;
+}
+static unsigned int unpack_secs(unsigned long entry)
+{
+ unsigned int secs;
+ int secs_shrink_size;
+
+ secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+ entry >>= EVICTION_SECS_SHRINK_SHIFT;
+ secs = entry & EVICTION_SECS_POS_MASK;
+ secs = secs << secs_shrink_size;
+ return secs;
+}
+#endif
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
  bool workingset)
 {
  eviction >>= bucket_order;
  eviction &= EVICTION_MASK;
+ pack_secs(&eviction);
  eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
  eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
  eviction = (eviction << 1) | workingset;
@@ -194,11 +239,12 @@ static void *pack_shadow(int memcgid, pg_data_t
*pgdat, unsigned long eviction,
 }

 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-   unsigned long *evictionp, bool *workingsetp)
+ unsigned long *evictionp, bool *workingsetp, unsigned int *prev_secs)
 {
  unsigned long entry = xa_to_value(shadow);
  int memcgid, nid;
  bool workingset;
+ unsigned int secs;

  workingset = entry & 1;
  entry >>= 1;
@@ -206,11 +252,14 @@ static void unpack_shadow(void *shadow, int
*memcgidp, pg_data_t **pgdat,
  entry >>= NODES_SHIFT;
  memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
  entry >>= MEM_CGROUP_ID_SHIFT;
+ secs = unpack_secs(entry);
+ entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);

  *memcgidp = memcgid;
  *pgdat = NODE_DATA(nid);
  *evictionp = entry << bucket_order;
  *workingsetp = workingset;
+ *prev_secs = secs;
 }

 /**
@@ -257,8 +306,22 @@ void workingset_refault(struct page *page, void *shadow)
  unsigned long refault;
  bool workingset;
  int memcgid;
+#ifndef NO_SECS_IN_WORKINGSET
+ unsigned long avg_refault_time;
+ unsigned long refaults_ratio;
+ unsigned long refault_time;
+ int tradition;
+ unsigned int prev_secs;
+ unsigned int secs;
+#endif
+ struct timespec64 ts;
+ /*
+ convert jiffies to second
+ */
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;

- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset, &prev_secs);

  rcu_read_lock();
  /*
@@ -303,23 +366,58 @@ void workingset_refault(struct page *page, void *shadow)
  refault_distance = (refault - eviction) & EVICTION_MASK;

  inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
+#ifndef NO_SECS_IN_WORKINGSET
+ refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
+ atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
+ refault_time = secs - prev_secs;
+ avg_refault_time = active_file / refaults_ratio;
+ tradition = !!(refault_distance < active_file);
  /*
- * Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * What we are trying to solve here is
+ * 1. extremely fast refault as refault_time == 0.
+ * 2. quick file drop scenario, which has a big refault_distance but
+ *    small refault_time comparing with the past refault ratio, which
+ *    will be deemed as inactive in previous implementation.
  */
- if (refault_distance > active_file)
+ if (refault_time && (((refault_time < avg_refault_time)
+ && (avg_refault_time < 2 * refault_time))
+ || (refault_time >= avg_refault_time))) {
+ trace_printk("WKST_INACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
  goto out;
+ }
+ else {
+#else
+ if (refault_distance < active_file) {
+#endif

- SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */

- /* Page was active prior to eviction */
- if (workingset) {
- SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ }
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_ACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
+#endif
  }
 out:
  rcu_read_unlock();
@@ -539,7 +637,9 @@ static int __init workingset_init(void)
  unsigned int max_order;
  int ret;

- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
+ + EVICTION_SECS_POS_SHIFT
+ + EVICTION_SECS_SHRINK_SHIFT));
  /*
  * Calculate the eviction bucket size to cover the longest
  * actionable refault distance, which is currently half of
@@ -547,7 +647,9 @@ static int __init workingset_init(void)
  * some more pages at runtime, so keep working with up to
  * double the initial memory by using totalram_pages as-is.
  */
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
+ - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
+
  max_order = fls_long(totalram_pages() - 1);
  if (max_order > timestamp_bits)
  bucket_order = max_order - timestamp_bits;

On Wed, Apr 17, 2019 at 9:37 PM Matthew Wilcox <willy@...radead.org> wrote:
>
> On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote:
> [quoting Johannes here]
> > As Matthew says, you are fairly randomly making refault activations
> > more aggressive (especially with that timestamp unpacking bug), and
> > while that expectedly boosts workload transition / startup, it comes
> > at the cost of disrupting stable states because you can flood a very
> > active in-ram workingset with completely cold cache pages simply
> > because they refault uniformly wrt each other.
> > [HZY]: I analysis the log got from trace_printk, what we activate have
> > proven record of long refault distance but very short refault time.
>
> You haven't addressed my point, which is that you were only testing
> workloads for which your changed algorithm would improve the results.
> What you haven't done is shown how other workloads would be negatively
> affected.
>
> Once you do that, we can make a decision about whether to improve your
> workload by X% and penalise that other workload by Y%.