linux-kernel - Re: [PATCH] mmu_notifier, kvm: Introduce dirty bit tracking in spte and mmu notifier to help KSM dirty bit tracking

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20110622002123.GP25383@sequoia.sous-sol.org>
Date:	Tue, 21 Jun 2011 17:21:23 -0700
From:	Chris Wright <chrisw@...s-sol.org>
To:	Nai Xia <nai.xia@...il.com>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Izik Eidus <izik.eidus@...ellosystems.com>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Hugh Dickins <hughd@...gle.com>,
	Chris Wright <chrisw@...s-sol.org>,
	Rik van Riel <riel@...hat.com>,
	linux-mm <linux-mm@...ck.org>,
	Johannes Weiner <hannes@...xchg.org>,
	linux-kernel <linux-kernel@...r.kernel.org>,
	kvm <kvm@...r.kernel.org>, mtosatti@...hat.com
Subject: Re: [PATCH] mmu_notifier, kvm: Introduce dirty bit tracking in spte
 and mmu notifier to help KSM dirty bit tracking

* Nai Xia (nai.xia@...il.com) wrote:
> Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
> and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
> significant performance gain in volatile pages scanning in KSM.
> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
> enabled to indicate that the dirty bits of underlying sptes are not updated by
> hardware.

Did you test with each of EPT, NPT and shadow?

> Signed-off-by: Nai Xia <nai.xia@...il.com>
> Acked-by: Izik Eidus <izik.eidus@...ellosystems.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h              |    3 +-
>  arch/x86/kvm/vmx.c              |    1 +
>  include/linux/kvm_host.h        |    2 +-
>  include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
>  mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
>  8 files changed, 149 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d2ac8e2..f0d7aa0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -848,6 +848,7 @@ extern bool kvm_rebooting;
>  int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
>  void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
>  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index aee3862..a5a0c51 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -979,6 +979,37 @@ out:
>  	return young;
>  }
>  
> +/*
> + * Caller is supposed to SetPageDirty(), it's not done inside this.
> + */
> +static
> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
> +				   unsigned long data)
> +{
> +	u64 *spte;
> +	int dirty = 0;
> +
> +	if (!shadow_dirty_mask) {
> +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> +		goto out;
> +	}

This should never fire with the dirty_update() notifier test, right?
And that means that this whole optimization is for the shadow mmu case,
arguably the legacy case.

> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		int _dirty;
> +		u64 _spte = *spte;
> +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> +		_dirty = _spte & PT_DIRTY_MASK;
> +		if (_dirty) {
> +			dirty = 1;
> +			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);

Is this sufficient (not losing dirty state ever)?

> +		}
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +out:
> +	return dirty;
> +}
> +
>  #define RMAP_RECYCLE_THRESHOLD 1000
>  
>  static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> @@ -1004,6 +1035,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
>  	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
>  
>  
> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 7086ca8..b8d01c3 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -18,7 +18,8 @@
>  #define PT_PCD_MASK (1ULL << 4)
>  #define PT_ACCESSED_SHIFT 5
>  #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
> -#define PT_DIRTY_MASK (1ULL << 6)
> +#define PT_DIRTY_SHIFT 6
> +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
>  #define PT_PAGE_SIZE_MASK (1ULL << 7)
>  #define PT_PAT_MASK (1ULL << 7)
>  #define PT_GLOBAL_MASK (1ULL << 8)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d48ec60..b407a69 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
>  		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
>  				VMX_EPT_EXECUTABLE_MASK);
>  		kvm_enable_tdp();
> +		kvm_dirty_update = 0;

Doesn't the above shadow_dirty_mask==0ull tell us this same info?

>  	} else
>  		kvm_disable_tdp();
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 31ebb59..2036bae 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -53,7 +53,7 @@
>  struct kvm;
>  struct kvm_vcpu;
>  extern struct kmem_cache *kvm_vcpu_cache;
> -
> +extern int kvm_dirty_update;
>  /*
>   * It would be nice to use something smarter than a linear search, TBD...
>   * Thankfully we dont expect many devices to register (famous last words :),
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 1d1b1e1..bd6ba2d 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -24,6 +24,9 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {

Need to add a comment to describe it.  And why is it not next to
test_and_clear_dirty()?  I see how it's used, but seems as if the
test_and_clear_dirty() code could return -1 (as in dirty state unknown)
for the case where it can't track dirty bit and fall back to checksum.

> +	int (*dirty_update)(struct mmu_notifier *mn,
> +			     struct mm_struct *mm);
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -72,6 +75,16 @@ struct mmu_notifier_ops {
>  			  unsigned long address);
>  
>  	/*
> +	 * clear_flush_dirty is called after the VM is
> +	 * test-and-clearing the dirty/modified bitflag in the
> +	 * pte. This way the VM will provide proper volatile page
> +	 * testing to ksm.
> +	 */
> +	int (*test_and_clear_dirty)(struct mmu_notifier *mn,
> +				    struct mm_struct *mm,
> +				    unsigned long address);
> +
> +	/*
>  	 * change_pte is called in cases that pte mapping to page is changed:
>  	 * for example, when ksm remaps pte to point to a new shared page.
>  	 */
> @@ -170,11 +183,14 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
>  extern void mmu_notifier_unregister(struct mmu_notifier *mn,
>  				    struct mm_struct *mm);
>  extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
> +extern int __mmu_notifier_dirty_update(struct mm_struct *mm);
>  extern void __mmu_notifier_release(struct mm_struct *mm);
>  extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
>  					  unsigned long address);
>  extern int __mmu_notifier_test_young(struct mm_struct *mm,
>  				     unsigned long address);
> +extern int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
> +					       unsigned long address);
>  extern void __mmu_notifier_change_pte(struct mm_struct *mm,
>  				      unsigned long address, pte_t pte);
>  extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
> @@ -184,6 +200,19 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
>  extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
>  
> +/*
> + * For ksm to make use of dirty bit, it wants to make sure that the dirty bits
> + * in sptes really carry the dirty information. Currently only intel EPT is
> + * not for ksm dirty bit tracking.
> + */
> +static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
> +{
> +	if (mm_has_notifiers(mm))
> +		return __mmu_notifier_dirty_update(mm);
> +

No need for extra newline.

> +	return 1;
> +}
> +

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 96ebc06..22967c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -78,6 +78,8 @@ static atomic_t hardware_enable_failed;
>  struct kmem_cache *kvm_vcpu_cache;
>  EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
>  
> +int kvm_dirty_update = 1;
> +
>  static __read_mostly struct preempt_ops kvm_preempt_ops;
>  
>  struct dentry *kvm_debugfs_dir;
> @@ -398,6 +400,23 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
>  	return young;
>  }
>  
> +/* Caller should SetPageDirty(), no need to flush tlb */
> +static int kvm_mmu_notifier_test_and_clear_dirty(struct mmu_notifier *mn,
> +						 struct mm_struct *mm,
> +						 unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int dirty, idx;

Perhaps something like:

	if (!shadow_dirty_mask)
		return -1;

And adjust caller logic accordingly?

> +     idx = srcu_read_lock(&kvm->srcu);
> +     spin_lock(&kvm->mmu_lock);
> +     dirty = kvm_test_and_clear_dirty_hva(kvm, address);
> +     spin_unlock(&kvm->mmu_lock);
> +     srcu_read_unlock(&kvm->srcu, idx);
> +
> +     return dirty;
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/