diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 841cee3f346d..548d1e480de9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -640,6 +640,37 @@ static bool mmu_spte_age(u64 *sptep)
 	return true;
 }
 
+/*
+ * Similar to mmu_spte_age(), but this one should be used for lockless shadow
+ * page table walks.
+ */
+static bool mmu_spte_age_lockless(u64 *sptep)
+{
+	u64 old_spte = mmu_spte_get_lockless(sptep);
+	u64 new_spte;
+
+	if (!is_accessed_spte(old_spte))
+		return false;
+
+	if (spte_ad_enabled(old_spte))
+		clear_bit((ffs(shadow_accessed_mask) - 1),
+			  (unsigned long *)sptep);
+	else {
+		new_spte = mark_spte_for_access_track(old_spte);
+		/*
+		 * If the SPTE changed under us, it's likely that the
+		 * gfn is young.
+		 */
+		if (!try_cmpxchg64(sptep, &old_spte, new_spte))
+			return true;
+
+		if (is_writable_pte(old_spte))
+			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+	}
+
+	return true;
+}
+
 static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
 {
 	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@ -647,6 +678,11 @@ static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
+	if (!vcpu) {
+		rcu_read_lock();
+		return;
+	}
+
 	if (is_tdp_mmu_active(vcpu)) {
 		kvm_tdp_mmu_walk_lockless_begin();
 	} else {
@@ -666,6 +702,11 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
+	if (!vcpu) {
+		rcu_read_unlock();
+		return;
+	}
+
 	if (is_tdp_mmu_active(vcpu)) {
 		kvm_tdp_mmu_walk_lockless_end();
 	} else {
@@ -949,14 +990,14 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 	int count = 0;
 
 	if (!rmap_head->val) {
-		rmap_head->val = (unsigned long)spte;
+		WRITE_ONCE(rmap_head->val, (unsigned long)spte);
 	} else if (!(rmap_head->val & 1)) {
 		desc = kvm_mmu_memory_cache_alloc(cache);
 		desc->sptes[0] = (u64 *)rmap_head->val;
 		desc->sptes[1] = spte;
 		desc->spte_count = 2;
 		desc->tail_count = 0;
-		rmap_head->val = (unsigned long)desc | 1;
+		WRITE_ONCE(rmap_head->val, (unsigned long)desc | 1);
 		++count;
 	} else {
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@@ -971,7 +1012,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 			desc->spte_count = 0;
 			desc->tail_count = count;
-			rmap_head->val = (unsigned long)desc | 1;
+			WRITE_ONCE(rmap_head->val, (unsigned long)desc | 1);
 		}
 		desc->sptes[desc->spte_count++] = spte;
 	}
@@ -1009,9 +1050,10 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
 	 * head at the next descriptor, i.e. the new head.
 	 */
 	if (!head_desc->more)
-		rmap_head->val = 0;
+		WRITE_ONCE(rmap_head->val, 0);
 	else
-		rmap_head->val = (unsigned long)head_desc->more | 1;
+		WRITE_ONCE(rmap_head->val,
+			   (unsigned long)head_desc->more | 1);
 	mmu_free_pte_list_desc(head_desc);
 }
 
@@ -1028,7 +1070,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
 			return;
 
-		rmap_head->val = 0;
+		WRITE_ONCE(rmap_head->val, 0);
 	} else {
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 		while (desc) {
@@ -1078,7 +1120,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	}
 out:
 	/* rmap_head is meaningless now, remember to reset it */
-	rmap_head->val = 0;
+	WRITE_ONCE(rmap_head->val, 0);
 	return true;
 }
 
@@ -1634,17 +1676,64 @@ static bool kvm_has_shadow_mmu_sptes(struct kvm *kvm)
 	return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
 }
 
+static bool kvm_age_rmap_fast(u64 *sptep)
+{
+	return mmu_spte_age_lockless(sptep);
+}
+
+static bool kvm_test_age_rmap_fast(u64 *sptep)
+{
+	return is_accessed_spte(READ_ONCE(*sptep));
+}
+
+typedef bool (*rmap_lockless_handler_t)(u64 *sptep);
+
+static __always_inline bool kvm_handle_gfn_range_lockless(
+		struct kvm *kvm, struct kvm_gfn_range *range,
+		rmap_lockless_handler_t handler)
+{
+	struct kvm_rmap_head *rmap;
+	u64 *sptep;
+	gfn_t gfn;
+	int level;
+	bool ret = false;
+
+	walk_shadow_page_lockless_begin(NULL);
+
+	for (gfn = range->start; gfn < range->end; gfn++) {
+		for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL;
+		     level++) {
+			rmap = gfn_to_rmap(gfn, level, range->slot);
+			sptep = (void *)READ_ONCE(rmap->val);
+
+			/* Skip gfns with zero or multiple SPTEs mapping them */
+			if (!sptep || ((unsigned long)sptep & 1))
+				continue;
+
+			ret |= handler(sptep);
+		}
+	}
+
+	walk_shadow_page_lockless_end(NULL);
+
+	return ret;
+}
+
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	bool young = false;
+	bool young = false, shadow_young = false;
 
-	if (tdp_mmu_enabled) {
+	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
-		if (young)
-			range->arg.report_fast = true;
-	}
 
-	if (!range->arg.fast_only && kvm_has_shadow_mmu_sptes(kvm)) {
+	shadow_young = kvm_handle_gfn_range_lockless(kvm, range,
						     kvm_age_rmap_fast);
+	young |= shadow_young;
+
+	if (young)
+		range->arg.report_fast = true;
+	else if (!shadow_young && !range->arg.fast_only &&
+		 kvm_has_shadow_mmu_sptes(kvm)) {
 		write_lock(&kvm->mmu_lock);
 		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
 		write_unlock(&kvm->mmu_lock);
@@ -1657,11 +1746,15 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (tdp_mmu_enabled) {
+	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
-		if (young)
-			range->arg.report_fast = true;
-	}
+
+	if (!young)
+		young |= kvm_handle_gfn_range_lockless(kvm, range,
						       kvm_test_age_rmap_fast);
+
+	if (young)
+		range->arg.report_fast = true;
 
 	if (!young && !range->arg.fast_only && kvm_has_shadow_mmu_sptes(kvm)) {
 		write_lock(&kvm->mmu_lock);
@@ -2636,6 +2729,12 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	 */
 	kvm_flush_remote_tlbs(kvm);
 
+	/*
+	 * Wait for any non-vCPU lockless shadow page table walkers to stop
+	 * using the shadow pages we're about to free.
+	 */
+	synchronize_rcu();
+
 	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
 		WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
 		kvm_mmu_free_shadow_page(sp);