UBUNTU: SAUCE: kvm: mmu: ITLB_MULTIHIT mitigation

author Paolo Bonzini <pbonzini@redhat.com>

Thu, 31 Oct 2019 23:33:45 +0000 (00:33 +0100)

committer Stefan Bader <stefan.bader@canonical.com>

Mon, 4 Nov 2019 17:31:50 +0000 (18:31 +0100)
author Paolo Bonzini <pbonzini@redhat.com>
Thu, 31 Oct 2019 23:33:45 +0000 (00:33 +0100)
committer Stefan Bader <stefan.bader@canonical.com>
Mon, 4 Nov 2019 17:31:50 +0000 (18:31 +0100)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index b250e9a00a585f995652a2fdb081b435f1d00258..0cd32c300f5733f12181bdf76892f82d3161eaa4 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1995,6 +1995,17 @@
                         KVM MMU at runtime.
                         Default is 0 (off)
  
+       kvm.nx_huge_pages=
+                       [KVM] Controls the sw workaround for bug
+                       X86_BUG_ITLB_MULTIHIT.
+                       force   : Always deploy workaround.
+                       off     : Never deploy workaround.
+                       auto    : Default. Deploy workaround based on
+                                 presence of X86_BUG_ITLB_MULTIHIT.
+
+                       If the sw workaround is enabled for the host, guests
+                       need not enable it for nested guests.
+
         kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
                         Default is 1 (enabled)
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 6550e468451157bb7ceb38764376966a6d68cba4..a610e3d20ef18c3096dc46b6398df4291dfb2115 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,7 @@ struct kvm_mmu_page {
         struct list_head link;
         struct hlist_node hash_link;
         bool unsync;
+       bool lpage_disallowed; /* Can't be replaced by an equiv large page */
  
         /*
          * The following two entries are used to key the shadow page in the
@@ -942,6 +943,7 @@ struct kvm_vm_stat {
         ulong mmu_unsync;
         ulong remote_tlb_flush;
         ulong lpages;
+       ulong nx_lpage_splits;
         ulong max_mmu_page_hash_collisions;
  };
  
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c

index d64a8e59d7c2eee0e9d465035a921a89e46b6069..b6c84f8ba29bec644d9c5b728e684d1cb4794ee4 100644 (file)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1245,6 +1245,9 @@ void x86_spec_ctrl_setup_ap(void)
                 x86_amd_ssb_disable();
  }
  
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
  #undef pr_fmt
  #define pr_fmt(fmt)    "L1TF: " fmt
  
@@ -1400,17 +1403,25 @@ static ssize_t l1tf_show_state(char *buf)
                        l1tf_vmx_states[l1tf_vmx_mitigation],
                        sched_smt_active() ? "vulnerable" : "disabled");
  }
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+       if (itlb_multihit_kvm_mitigation)
+               return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+       else
+               return sprintf(buf, "KVM: Vulnerable\n");
+}
  #else
  static ssize_t l1tf_show_state(char *buf)
  {
         return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
  }
-#endif
  
  static ssize_t itlb_multihit_show_state(char *buf)
  {
         return sprintf(buf, "Processor vulnerable\n");
  }
+#endif
  
  static ssize_t mds_show_state(char *buf)
  {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index dfb77f506e21d5acb0677d5ad85c6db2a1355ff0..8f1e50e3d23952e637140afd6e53a8bac4839f78 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,6 +49,20 @@
  #include <asm/kvm_page_track.h>
  #include "trace.h"
  
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+       .set = set_nx_huge_pages,
+       .get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
   * where the hardware walks 2 page tables:
@@ -313,6 +327,11 @@ static inline bool spte_ad_enabled(u64 spte)
         return !(spte & shadow_acc_track_value);
  }
  
+static bool is_nx_huge_page_enabled(void)
+{
+       return READ_ONCE(nx_huge_pages);
+}
+
  static inline u64 spte_shadow_accessed_mask(u64 spte)
  {
         MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1125,6 +1144,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
         kvm_mmu_gfn_disallow_lpage(slot, gfn);
  }
  
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       if (sp->lpage_disallowed)
+               return;
+
+       ++kvm->stat.nx_lpage_splits;
+       sp->lpage_disallowed = true;
+}
+
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
         struct kvm_memslots *slots;
@@ -1142,6 +1170,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
         kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       --kvm->stat.nx_lpage_splits;
+       sp->lpage_disallowed = false;
+}
+
  static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
                                           struct kvm_memory_slot *slot)
  {
@@ -2707,6 +2741,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                         kvm_reload_remote_mmus(kvm);
         }
  
+       if (sp->lpage_disallowed)
+               unaccount_huge_nx_page(kvm, sp);
+
         sp->role.invalid = 1;
         return ret;
  }
@@ -2915,6 +2952,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         if (!speculative)
                 spte |= spte_shadow_accessed_mask(spte);
  
+       if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+           is_nx_huge_page_enabled()) {
+               pte_access &= ~ACC_EXEC_MASK;
+       }
+
         if (pte_access & ACC_EXEC_MASK)
                 spte |= shadow_x_mask;
         else
@@ -3135,9 +3177,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
         __direct_pte_prefetch(vcpu, sp, sptep);
  }
  
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+                                      gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+       int level = *levelp;
+       u64 spte = *it.sptep;
+
+       if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+           is_nx_huge_page_enabled() &&
+           is_shadow_present_pte(spte) &&
+           !is_large_pte(spte)) {
+               /*
+                * A small SPTE exists for this pfn, but FNAME(fetch)
+                * and __direct_map would like to create a large PTE
+                * instead: just force them to go down another level,
+                * patching back for them into pfn the next 9 bits of
+                * the address.
+                */
+               u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+               *pfnp |= gfn & page_mask;
+               (*levelp)--;
+       }
+}
+
  static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                         int map_writable, int level, kvm_pfn_t pfn,
-                       bool prefault)
+                       bool prefault, bool lpage_disallowed)
  {
         struct kvm_shadow_walk_iterator it;
         struct kvm_mmu_page *sp;
@@ -3150,6 +3215,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
  
         trace_kvm_mmu_spte_requested(gpa, level, pfn);
         for_each_shadow_entry(vcpu, gpa, it) {
+               /*
+                * We cannot overwrite existing page tables with an NX
+                * large page, as the leaf could be executable.
+                */
+               disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
                 if (it.level == level)
                         break;
@@ -3160,6 +3231,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                                               it.level - 1, true, ACC_ALL);
  
                         link_shadow_page(vcpu, it.sptep, sp);
+                       if (lpage_disallowed)
+                               account_huge_nx_page(vcpu->kvm, sp);
                 }
         }
  
@@ -3451,11 +3524,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
  {
         int r;
         int level;
-       bool force_pt_level = false;
+       bool force_pt_level;
         kvm_pfn_t pfn;
         unsigned long mmu_seq;
         bool map_writable, write = error_code & PFERR_WRITE_MASK;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
  
+       force_pt_level = lpage_disallowed;
         level = mapping_level(vcpu, gfn, &force_pt_level);
         if (likely(!force_pt_level)) {
                 /*
@@ -3489,7 +3565,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
                 goto out_unlock;
         if (likely(!force_pt_level))
                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+       r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+                        prefault, false);
  out_unlock:
         spin_unlock(&vcpu->kvm->mmu_lock);
         kvm_release_pfn_clean(pfn);
@@ -4088,6 +4165,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         unsigned long mmu_seq;
         int write = error_code & PFERR_WRITE_MASK;
         bool map_writable;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
  
         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
  
@@ -4098,8 +4177,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         if (r)
                 return r;
  
-       force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-                                                          PT_DIRECTORY_LEVEL);
+       force_pt_level =
+               lpage_disallowed ||
+               !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
         level = mapping_level(vcpu, gfn, &force_pt_level);
         if (likely(!force_pt_level)) {
                 if (level > PT_DIRECTORY_LEVEL &&
@@ -4128,7 +4208,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
                 goto out_unlock;
         if (likely(!force_pt_level))
                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+                        prefault, lpage_disallowed);
  out_unlock:
         spin_unlock(&vcpu->kvm->mmu_lock);
         kvm_release_pfn_clean(pfn);
@@ -6018,10 +6099,52 @@ static void mmu_destroy_caches(void)
         kmem_cache_destroy(mmu_page_header_cache);
  }
  
+static void __set_nx_huge_pages(bool val)
+{
+       nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+       bool old_val = nx_huge_pages;
+       bool new_val;
+
+       /* In "auto" mode deploy workaround only if CPU has the bug. */
+       if (sysfs_streq(val, "off"))
+               new_val = 0;
+       else if (sysfs_streq(val, "force"))
+               new_val = 1;
+       else if (sysfs_streq(val, "auto"))
+               new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);
+       else if (strtobool(val, &new_val) < 0)
+               return -EINVAL;
+
+       __set_nx_huge_pages(new_val);
+
+       if (new_val != old_val) {
+               struct kvm *kvm;
+               int idx;
+
+               mutex_lock(&kvm_lock);
+
+               list_for_each_entry(kvm, &vm_list, vm_list) {
+                       idx = srcu_read_lock(&kvm->srcu);
+                       kvm_mmu_invalidate_zap_all_pages(kvm);
+                       srcu_read_unlock(&kvm->srcu, idx);
+               }
+               mutex_unlock(&kvm_lock);
+       }
+
+       return 0;
+}
+
  int kvm_mmu_module_init(void)
  {
         int ret = -ENOMEM;
  
+       if (nx_huge_pages == -1)
+               __set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT));
+
         /*
          * MMU roles use union aliasing which is, generally speaking, an
          * undefined behavior. However, we supposedly know how compilers behave
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h

index 81ec22da74e1aeb7d4619a9860daa393049cbfca..2dd0fe94c0c14f2d6be86c0c8847b3a8a633a80f 100644 (file)
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -597,13 +597,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                          struct guest_walker *gw,
                          int write_fault, int hlevel,
-                        kvm_pfn_t pfn, bool map_writable, bool prefault)
+                        kvm_pfn_t pfn, bool map_writable, bool prefault,
+                        bool lpage_disallowed)
  {
         struct kvm_mmu_page *sp = NULL;
         struct kvm_shadow_walk_iterator it;
         unsigned direct_access, access = gw->pt_access;
         int top_level, ret;
-       gfn_t base_gfn;
+       gfn_t gfn, base_gfn;
  
         direct_access = gw->pte_access;
  
@@ -648,13 +649,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         link_shadow_page(vcpu, it.sptep, sp);
         }
  
-       base_gfn = gw->gfn;
+       /*
+        * FNAME(page_fault) might have clobbered the bottom bits of
+        * gw->gfn, restore them from the virtual address.
+        */
+       gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+       base_gfn = gfn;
  
         trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
  
         for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
                 clear_sp_write_flooding_count(it.sptep);
-               base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+               /*
+                * We cannot overwrite existing page tables with an NX
+                * large page, as the leaf could be executable.
+                */
+               disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+               base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
                 if (it.level == hlevel)
                         break;
  
@@ -666,6 +679,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
                                               it.level - 1, true, direct_access);
                         link_shadow_page(vcpu, it.sptep, sp);
+                       if (lpage_disallowed)
+                               account_huge_nx_page(vcpu->kvm, sp);
                 }
         }
  
@@ -742,9 +757,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         int r;
         kvm_pfn_t pfn;
         int level = PT_PAGE_TABLE_LEVEL;
-       bool force_pt_level = false;
         unsigned long mmu_seq;
         bool map_writable, is_self_change_mapping;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
+       bool force_pt_level = lpage_disallowed;
  
         pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
  
@@ -834,7 +851,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         if (!force_pt_level)
                 transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
         r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-                        level, pfn, map_writable, prefault);
+                        level, pfn, map_writable, prefault, lpage_disallowed);
         kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
  
  out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 34a8a69bbe24342420e929a1e77e5b7b16252240..fd8205fcb6a019461f6bb7c8510a74d5e5278578 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -211,6 +211,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "mmu_unsync", VM_STAT(mmu_unsync) },
         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
         { "largepages", VM_STAT(lpages, .mode = 0444) },
+       { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
         { "max_mmu_page_hash_collisions",
                 VM_STAT(max_mmu_page_hash_collisions) },
         { NULL }
@@ -1220,6 +1221,14 @@ u64 kvm_get_arch_capabilities(void)
  
         rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
  
+       /*
+        * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+        * the nested hypervisor runs with NX huge pages.  If it is not,
+        * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+        * L1 guests, so it need not worry about its own (L2) guests.
+        */
+       data |= ARCH_CAP_PSCHANGE_MC_NO;
+
         /*
          * If we're doing cache flushes (either "always" or "cond")
          * we will do one whenever the guest does a vmlaunch/vmresume.
author	Paolo Bonzini <pbonzini@redhat.com>
	Thu, 31 Oct 2019 23:33:45 +0000 (00:33 +0100)
committer	Stefan Bader <stefan.bader@canonical.com>
	Mon, 4 Nov 2019 17:31:50 +0000 (18:31 +0100)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| blame \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| blame \| history
arch/x86/kernel/cpu/bugs.c		patch \| blob \| blame \| history
arch/x86/kvm/mmu.c		patch \| blob \| blame \| history
arch/x86/kvm/paging_tmpl.h		patch \| blob \| blame \| history
arch/x86/kvm/x86.c		patch \| blob \| blame \| history