Version in base suite: 4.14.4+74-gd7b22226b5-1 Base version: xen_4.14.4+74-gd7b22226b5-1 Target version: xen_4.14.5+24-g87d90d511c-1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/x/xen/xen_4.14.4+74-gd7b22226b5-1.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/x/xen/xen_4.14.5+24-g87d90d511c-1.dsc Config.mk | 6 debian/changelog | 19 debian/patches/0003-version.patch | 8 debian/patches/0017-Fix-empty-fields-in-first-hypervisor-log-line.patch | 2 debian/patches/0043-x86-ACPI-fix-S3-wakeup-vector-mapping.patch | 2 debian/patches/prefix-abiname/config-prefix.diff | 2 docs/misc/xen-command-line.pandoc | 54 + tools/libxl/libxl_cpuid.c | 1 tools/misc/xen-cpuid.c | 2 xen/Makefile | 2 xen/arch/arm/arm32/livepatch.c | 7 xen/arch/arm/arm64/livepatch.c | 7 xen/arch/x86/acpi/power.c | 8 xen/arch/x86/cpu/amd.c | 46 + xen/arch/x86/cpu/cpu.h | 1 xen/arch/x86/cpu/hygon.c | 6 xen/arch/x86/cpuid.c | 7 xen/arch/x86/domain.c | 14 xen/arch/x86/flushtlb.c | 95 ++ xen/arch/x86/hvm/svm/entry.S | 18 xen/arch/x86/hvm/svm/vmcb.c | 9 xen/arch/x86/hvm/vmx/entry.S | 2 xen/arch/x86/hvm/vmx/vmcs.c | 4 xen/arch/x86/livepatch.c | 7 xen/arch/x86/mm.c | 301 ++++---- xen/arch/x86/pv/grant_table.c | 21 xen/arch/x86/spec_ctrl.c | 346 ++++++++-- xen/arch/x86/x86_64/compat/entry.S | 4 xen/arch/x86/x86_64/entry.S | 14 xen/common/grant_table.c | 4 xen/common/kernel.c | 20 xen/common/livepatch.c | 16 xen/common/livepatch_elf.c | 6 xen/drivers/passthrough/vtd/extern.h | 3 xen/drivers/passthrough/vtd/iommu.c | 99 -- xen/drivers/passthrough/vtd/quirks.c | 2 xen/drivers/passthrough/vtd/vtd.h | 3 xen/drivers/passthrough/vtd/x86/vtd.c | 5 xen/drivers/vpci/msix.c | 11 xen/include/asm-x86/cache.h | 7 xen/include/asm-x86/cpufeatures.h | 8 xen/include/asm-x86/domain.h | 2 xen/include/asm-x86/mm.h | 29 xen/include/asm-x86/msr-index.h | 7 xen/include/asm-x86/page.h | 12 xen/include/asm-x86/spec_ctrl.h | 43 + xen/include/asm-x86/spec_ctrl_asm.h | 95 ++ xen/include/public/arch-x86/cpufeatureset.h | 1 xen/include/xen/lib.h | 3 xen/include/xen/livepatch_elf.h | 3 50 files changed, 1014 insertions(+), 380 deletions(-) diff -Nru xen-4.14.4+74-gd7b22226b5/Config.mk xen-4.14.5+24-g87d90d511c/Config.mk --- xen-4.14.4+74-gd7b22226b5/Config.mk 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/Config.mk 2022-07-12 15:31:49.000000000 +0000 @@ -244,15 +244,15 @@ MINIOS_UPSTREAM_URL ?= git://xenbits.xen.org/mini-os.git endif OVMF_UPSTREAM_REVISION ?= 20d2e5a125e34fc8501026613a71549b2a1a3e54 -QEMU_UPSTREAM_REVISION ?= qemu-xen-4.14.4 -MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.14.4 +QEMU_UPSTREAM_REVISION ?= qemu-xen-4.14.5 +MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.14.5 SEABIOS_UPSTREAM_REVISION ?= rel-1.13.0 ETHERBOOT_NICS ?= rtl8139 8086100e -QEMU_TRADITIONAL_REVISION ?= xen-4.14.4 +QEMU_TRADITIONAL_REVISION ?= xen-4.14.5 # Specify which qemu-dm to use. This may be `ioemu' to use the old # Mercurial in-tree version, or a local directory, or a git URL. 
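The `?=` assignments in the Config.mk hunk above pin the companion qemu-xen, mini-os and qemu-traditional trees to the matching 4.14.5 release tags while still allowing a builder to override them. As a rough standalone sketch of how GNU make's conditional assignment behaves (illustrative only; sketch.mk and the `show` target are not part of the Xen tree):

    # sketch.mk -- `?=` keeps the in-tree default unless the variable is
    # already set in the environment or on the make command line, e.g.
    #   make -f sketch.mk QEMU_UPSTREAM_REVISION=my-test-branch
    QEMU_UPSTREAM_REVISION ?= qemu-xen-4.14.5

    show:
    	@echo "Building against $(QEMU_UPSTREAM_REVISION)"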
diff -Nru xen-4.14.4+74-gd7b22226b5/debian/changelog xen-4.14.5+24-g87d90d511c/debian/changelog --- xen-4.14.4+74-gd7b22226b5/debian/changelog 2022-04-08 09:40:51.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/debian/changelog 2022-07-13 14:28:39.000000000 +0000 @@ -1,3 +1,21 @@ +xen (4.14.5+24-g87d90d511c-1) bullseye-security; urgency=medium + + * Update to new upstream version 4.14.5+24-g87d90d511c, which also contains + security fixes for the following issues: + - x86 pv: Race condition in typeref acquisition + XSA-401 CVE-2022-26362 + - x86 pv: Insufficient care with non-coherent mappings + XSA-402 CVE-2022-26363 CVE-2022-26364 + - x86: MMIO Stale Data vulnerabilities + XSA-404 CVE-2022-21123 CVE-2022-21125 CVE-2022-21166 + - Retbleed - arbitrary speculative code execution with return instructions + XSA-407 CVE-2022-23816 CVE-2022-23825 CVE-2022-29900 + * Note that the following XSAs are not listed, because... + - XSA-403 patches are not applied to stable branch lines. + - XSA-405 and XSA-406 have patches for the Linux kernel. + + -- Hans van Kranenburg Wed, 13 Jul 2022 16:28:39 +0200 + xen (4.14.4+74-gd7b22226b5-1) bullseye-security; urgency=medium * Update to new upstream version 4.14.4+74-gd7b22226b5, which also contains diff -Nru xen-4.14.4+74-gd7b22226b5/debian/patches/0003-version.patch xen-4.14.5+24-g87d90d511c/debian/patches/0003-version.patch --- xen-4.14.4+74-gd7b22226b5/debian/patches/0003-version.patch 2022-04-08 09:40:51.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/debian/patches/0003-version.patch 2022-07-13 14:28:39.000000000 +0000 @@ -12,7 +12,7 @@ 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/xen/Makefile b/xen/Makefile -index 7c98e4a..2cf10a4 100644 +index 46c8177..ac117b5 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -382,7 +382,7 @@ delete-unfresh-files: @@ -39,10 +39,10 @@ include/asm-$(TARGET_ARCH)/asm-offsets.h: arch/$(TARGET_ARCH)/asm-offsets.s diff --git a/xen/common/kernel.c b/xen/common/kernel.c -index c3a943f..12bdf9d 100644 +index f07ff41..e41525d 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c -@@ -398,9 +398,9 @@ static int __init buildinfo_init(void) +@@ -410,9 +410,9 @@ static int __init buildinfo_init(void) hypfs_add_dir(&buildinfo, &compileinfo, true); hypfs_string_set_reference(&compiler, xen_compiler()); @@ -54,7 +54,7 @@ hypfs_add_leaf(&compileinfo, &compiler, true); hypfs_add_leaf(&compileinfo, &compile_by, true); hypfs_add_leaf(&compileinfo, &compile_date, true); -@@ -481,8 +481,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) +@@ -493,8 +493,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) memset(&info, 0, sizeof(info)); safe_strcpy(info.compiler, deny ?
xen_deny() : xen_compiler()); diff -Nru xen-4.14.4+74-gd7b22226b5/debian/patches/0017-Fix-empty-fields-in-first-hypervisor-log-line.patch xen-4.14.5+24-g87d90d511c/debian/patches/0017-Fix-empty-fields-in-first-hypervisor-log-line.patch --- xen-4.14.4+74-gd7b22226b5/debian/patches/0017-Fix-empty-fields-in-first-hypervisor-log-line.patch 2022-04-08 09:40:51.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/debian/patches/0017-Fix-empty-fields-in-first-hypervisor-log-line.patch 2022-07-13 14:28:39.000000000 +0000 @@ -28,7 +28,7 @@ 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xen/Makefile b/xen/Makefile -index 2cf10a4..3d6f733 100644 +index ac117b5..3c0ec6b 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -394,9 +394,9 @@ include/xen/compile.h: include/xen/compile.h.in diff -Nru xen-4.14.4+74-gd7b22226b5/debian/patches/0043-x86-ACPI-fix-S3-wakeup-vector-mapping.patch xen-4.14.5+24-g87d90d511c/debian/patches/0043-x86-ACPI-fix-S3-wakeup-vector-mapping.patch --- xen-4.14.4+74-gd7b22226b5/debian/patches/0043-x86-ACPI-fix-S3-wakeup-vector-mapping.patch 2022-04-08 09:40:51.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/debian/patches/0043-x86-ACPI-fix-S3-wakeup-vector-mapping.patch 2022-07-13 14:28:39.000000000 +0000 @@ -41,7 +41,7 @@ offsetof(struct acpi_table_facs, firmware_waking_vector); acpi_sinfo.vector_width = 32; diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index 774e0fc..dfd8611 100644 +index 06f3e0e..9875708 100644 --- a/xen/arch/x86/acpi/power.c +++ b/xen/arch/x86/acpi/power.c @@ -174,17 +174,20 @@ static void acpi_sleep_prepare(u32 state) diff -Nru xen-4.14.4+74-gd7b22226b5/debian/patches/prefix-abiname/config-prefix.diff xen-4.14.5+24-g87d90d511c/debian/patches/prefix-abiname/config-prefix.diff --- xen-4.14.4+74-gd7b22226b5/debian/patches/prefix-abiname/config-prefix.diff 2022-04-08 09:40:51.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/debian/patches/prefix-abiname/config-prefix.diff 2022-07-13 14:28:39.000000000 +0000 @@ -9,7 +9,7 @@ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Config.mk b/Config.mk -index 51910c5..8655152 100644 +index ba5eb4e..99364f6 100644 --- a/Config.mk +++ b/Config.mk @@ -74,7 +74,7 @@ EXTRA_LIB += $(EXTRA_PREFIX)/lib diff -Nru xen-4.14.4+74-gd7b22226b5/docs/misc/xen-command-line.pandoc xen-4.14.5+24-g87d90d511c/docs/misc/xen-command-line.pandoc --- xen-4.14.4+74-gd7b22226b5/docs/misc/xen-command-line.pandoc 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/docs/misc/xen-command-line.pandoc 2022-07-12 15:31:49.000000000 +0000 @@ -2104,9 +2104,11 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). ### spec-ctrl (x86) -> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, -> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, -> l1d-flush,branch-harden,srb-lock}=<bool> ]` +> `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, +> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, +> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, +> eager-fpu,l1d-flush,branch-harden,srb-lock, +> unpriv-mmio}=<bool> ]` Controls for speculative execution sidechannel mitigations. By default, Xen will pick the most appropriate mitigations based on compiled in support, @@ -2128,13 +2130,18 @@ Use of a positive boolean value for either of these options is invalid. -The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine -grained control over the alternative blocks used by Xen. These impact Xen's -ability to protect itself, and Xen's ability to virtualise support for guests -to use.
+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options +offer fine grained control over the primitives used by Xen. These impact Xen's +ability to protect itself, and/or Xen's ability to virtualise support for +guests to use. * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests respectively. +* Each other option can be used either as a plain boolean + (e.g. `spec-ctrl=rsb` to control both the PV and HVM sub-options), or with + `pv=` or `hvm=` sub-options (e.g. `spec-ctrl=rsb=no-hvm` to disable HVM + RSB only). + * `msr-sc=` offers control over Xen's support for manipulating `MSR_SPEC_CTRL` on entry and exit. These blocks are necessary to virtualise support for guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. @@ -2145,6 +2152,11 @@ compatibility with development versions of this fix, `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor documentation in preference to here.* +* `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction + Barrier) is used on entry to Xen. This is used by default on hardware + vulnerable to Branch Type Confusion, but for performance reasons, dom0 is + unprotected by default. If it is necessary to protect dom0 too, boot with + `spec-ctrl=ibpb-entry`. If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` @@ -2157,9 +2169,10 @@ If Xen is not using IBRS itself, functionality is still set up so IBRS can be virtualised for guests. -On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` -option can be used to force (the default) or prevent Xen from issuing branch -prediction barriers on vcpu context switches. +On hardware supporting STIBP (Single Thread Indirect Branch Predictors), the +`stibp=` option can be used to force or prevent Xen using the feature itself. +By default, Xen will use STIBP when IBRS is in use (IBRS implies STIBP), and +when hardware hints recommend using it as a blanket setting. On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=` option can be used to force or prevent Xen using the feature itself. On AMD @@ -2167,6 +2180,15 @@ guest use. On Intel hardware, the feature is virtualised for guests, independently of Xen's choice of setting. +On hardware supporting PSFD (Predictive Store Forwarding Disable), the `psfd=` +option can be used to force or prevent Xen using the feature itself. By +default, Xen will not use PSFD. PSFD is implied by SSBD, and SSBD is off by +default. + +On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` +option can be used to force (the default) or prevent Xen from issuing branch +prediction barriers on vcpu context switches. + On all hardware, the `eager-fpu=` option can be used to force or prevent Xen from using fully eager FPU context switches. This is currently implemented as a global control. By default, Xen will choose to use fully eager context @@ -2186,8 +2208,16 @@ On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force or prevent Xen from protecting the Special Register Buffer from leaking stale data. By default, Xen will enable this mitigation, except on parts where MDS
+is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO +mappings (in which case, there is believed to be no way for an attacker to +obtain stale data). + +The `unpriv-mmio=` boolean indicates whether the system has (or will have) +less than fully privileged domains granted access to MMIO devices. By +default, this option is disabled. If enabled, Xen will use the `FB_CLEAR` +and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode +release to mitigate cross-domain leakage of data via the MMIO Stale Data +vulnerabilities. ### sync_console > `= <boolean>` diff -Nru xen-4.14.4+74-gd7b22226b5/tools/libxl/libxl_cpuid.c xen-4.14.5+24-g87d90d511c/tools/libxl/libxl_cpuid.c --- xen-4.14.4+74-gd7b22226b5/tools/libxl/libxl_cpuid.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/tools/libxl/libxl_cpuid.c 2022-07-12 15:31:49.000000000 +0000 @@ -280,6 +280,7 @@ {"virt-ssbd", 0x80000008, NA, CPUID_REG_EBX, 25, 1}, {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, + {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, diff -Nru xen-4.14.4+74-gd7b22226b5/tools/misc/xen-cpuid.c xen-4.14.5+24-g87d90d511c/tools/misc/xen-cpuid.c --- xen-4.14.4+74-gd7b22226b5/tools/misc/xen-cpuid.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/tools/misc/xen-cpuid.c 2022-07-12 15:31:49.000000000 +0000 @@ -157,7 +157,7 @@ /* [22] */ [23] = "ppin", [24] = "amd-ssbd", [25] = "virt-ssbd", [26] = "ssb-no", - [28] = "psfd", + [28] = "psfd", [29] = "btc-no", }; static const char *const str_7d0[32] = diff -Nru xen-4.14.4+74-gd7b22226b5/xen/Makefile xen-4.14.5+24-g87d90d511c/xen/Makefile --- xen-4.14.4+74-gd7b22226b5/xen/Makefile 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/Makefile 2022-07-12 15:31:49.000000000 +0000 @@ -2,7 +2,7 @@ # All other places this is stored (eg. compile.h) should be autogenerated.
export XEN_VERSION = 4 export XEN_SUBVERSION = 14 -export XEN_EXTRAVERSION ?= .5-pre$(XEN_VENDORVERSION) +export XEN_EXTRAVERSION ?= .5$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/arm/arm32/livepatch.c xen-4.14.5+24-g87d90d511c/xen/arch/arm/arm32/livepatch.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/arm/arm32/livepatch.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/arm/arm32/livepatch.c 2022-07-12 15:31:49.000000000 +0000 @@ -272,6 +272,13 @@ elf->name, symndx); return -EINVAL; } + else if ( elf->sym[symndx].ignored ) + { + printk(XENLOG_ERR LIVEPATCH + "%s: Relocation against ignored symbol %s cannot be resolved\n", + elf->name, elf->sym[symndx].name); + return -EINVAL; + } val = elf->sym[symndx].sym->st_value; /* S */ diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/arm/arm64/livepatch.c xen-4.14.5+24-g87d90d511c/xen/arch/arm/arm64/livepatch.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/arm/arm64/livepatch.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/arm/arm64/livepatch.c 2022-07-12 15:31:49.000000000 +0000 @@ -270,6 +270,13 @@ elf->name, symndx); return -EINVAL; } + else if ( elf->sym[symndx].ignored ) + { + printk(XENLOG_ERR LIVEPATCH + "%s: Relocation against ignored symbol %s cannot be resolved\n", + elf->name, elf->sym[symndx].name); + return -EINVAL; + } val = elf->sym[symndx].sym->st_value + r->r_addend; /* S+A */ diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/acpi/power.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/acpi/power.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/acpi/power.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/acpi/power.c 2022-07-12 15:31:49.000000000 +0000 @@ -246,8 +246,8 @@ error = 0; ci = get_cpu_info(); - /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ - ci->spec_ctrl_flags &= ~SCF_ist_wrmsr; + /* Avoid NMI/#MC using unsafe MSRs until we've reloaded microcode. */ + ci->spec_ctrl_flags &= ~SCF_IST_MASK; ACPI_FLUSH_CPU_CACHE(); @@ -290,8 +290,8 @@ if ( !recheck_cpu_features(0) ) panic("Missing previously available feature(s)\n"); - /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */ - ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr); + /* Re-enable default NMI/#MC use of MSRs now microcode is loaded. */ + ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_IST_MASK); if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) ) { diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/amd.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/amd.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/amd.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/amd.c 2022-07-12 15:31:49.000000000 +0000 @@ -731,6 +731,31 @@ printk_once(XENLOG_ERR "No SSBD controls available\n"); } +/* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * Refer to the AMD Branch Type Confusion whitepaper: + * https://XXX + * + * Setting this unnamed bit supposedly causes prediction information on + * non-branch instructions to be ignored. It is to be set unilaterally in + * newer microcode. + * + * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a + * simple model number comparison, so use STIBP as a heuristic to separate the + * two uarches in Fam17h(AMD)/18h(Hygon).
+ */ +void amd_init_spectral_chicken(void) +{ + uint64_t val, chickenbit = 1 << 1; + + if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) + return; + + if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) + wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit); +} + static void init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -783,10 +808,21 @@ amd_init_ssbd(c); + if (c->x86 == 0x17) + amd_init_spectral_chicken(); + /* MFENCE stops RDTSC speculation */ if (!cpu_has_lfence_dispatch) __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); + /* + * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with + * everything, including reads and writes to address, and + * LFENCE/SFENCE instructions. + */ + if (!cpu_has_clflushopt) + setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); + switch(c->x86) { case 0xf ... 0x11: @@ -814,6 +850,16 @@ warning_add(text); } break; + + case 0x19: + /* + * Zen3 (Fam19h model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. Fill it back in if we're not virtualised. + */ + if (!cpu_has_hypervisor && !cpu_has(c, X86_FEATURE_BTC_NO)) + __set_bit(X86_FEATURE_BTC_NO, c->x86_capability); + break; } display_cacheinfo(c); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/cpu.h xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/cpu.h --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/cpu.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/cpu.h 2022-07-12 15:31:49.000000000 +0000 @@ -22,3 +22,4 @@ void amd_log_freq(const struct cpuinfo_x86 *c); void amd_init_lfence(struct cpuinfo_x86 *c); void amd_init_ssbd(const struct cpuinfo_x86 *c); +void amd_init_spectral_chicken(void); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/hygon.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/hygon.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpu/hygon.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpu/hygon.c 2022-07-12 15:31:49.000000000 +0000 @@ -36,6 +36,12 @@ amd_init_ssbd(c); + /* + * TODO: Check heuristic safety with Hygon first + if (c->x86 == 0x18) + amd_init_spectral_chicken(); + */ + /* MFENCE stops RDTSC speculation */ if (!cpu_has_lfence_dispatch) __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpuid.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpuid.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/cpuid.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/cpuid.c 2022-07-12 15:31:49.000000000 +0000 @@ -289,8 +289,15 @@ zero_leaves(p->extd.raw, 0xb, 0x18); + /* 0x19 - TLB details. Pass through. */ + /* 0x1a - Perf hints. Pass through. */ + p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */ p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. 
*/ + p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */ + p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */ + p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */ + p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */ break; } } diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/domain.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/domain.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/domain.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/domain.c 2022-07-12 15:31:49.000000000 +0000 @@ -651,6 +651,8 @@ domain_cpu_policy_changed(d); + spec_ctrl_init_domain(d); + return 0; fail: @@ -1763,14 +1765,15 @@ void context_switch(struct vcpu *prev, struct vcpu *next) { unsigned int cpu = smp_processor_id(); + struct cpu_info *info = get_cpu_info(); const struct domain *prevd = prev->domain, *nextd = next->domain; unsigned int dirty_cpu = read_atomic(&next->dirty_cpu); ASSERT(prev != next); ASSERT(local_irq_is_enabled()); - get_cpu_info()->use_pv_cr3 = false; - get_cpu_info()->xen_cr3 = 0; + info->use_pv_cr3 = false; + info->xen_cr3 = 0; if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN ) { @@ -1807,7 +1810,7 @@ ctxt_switch_levelling(next); - if ( opt_ibpb && !is_idle_domain(nextd) ) + if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) ) { static DEFINE_PER_CPU(unsigned int, last); unsigned int *last_id = &this_cpu(last); @@ -1834,6 +1837,11 @@ *last_id = next_id; } } + + /* Update the top-of-stack block with the new spec_ctrl settings. */ + info->spec_ctrl_flags = + (info->spec_ctrl_flags & ~SCF_DOM_MASK) | + (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK); } sched_context_switched(prev, next); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/flushtlb.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/flushtlb.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/flushtlb.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/flushtlb.c 2022-07-12 15:31:49.000000000 +0000 @@ -234,7 +234,7 @@ if ( flags & FLUSH_CACHE ) { const struct cpuinfo_x86 *c = ¤t_cpu_data; - unsigned long i, sz = 0; + unsigned long sz = 0; if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) sz = 1UL << (order + PAGE_SHIFT); @@ -244,13 +244,7 @@ c->x86_clflush_size && c->x86_cache_size && sz && ((sz >> 10) < c->x86_cache_size) ) { - alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); - for ( i = 0; i < sz; i += c->x86_clflush_size ) - alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" - " clflush %0", - "data16 clflush %0", /* clflushopt */ - X86_FEATURE_CLFLUSHOPT, - "m" (((const char *)va)[i])); + cache_flush(va, sz); flags &= ~FLUSH_CACHE; } else @@ -265,6 +259,91 @@ return flags; } +/* + * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, + * including reads and writes to address, and LFENCE/SFENCE instructions. + * + * This function only works safely after alternatives have run. Luckily, at + * the time of writing, we don't flush the caches that early. + */ +void cache_flush(const void *addr, unsigned int size) +{ + /* + * This function may be called before current_cpu_data is established. + * Hence a fallback is needed to prevent the loop below becoming infinite. 
+ */ + unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; + const void *end = addr + size; + + alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); + + addr -= (unsigned long)addr & (clflush_size - 1); + for ( ; addr < end; addr += clflush_size ) + { + /* + * Note regarding the "ds" prefix use: it's faster to do a clflush + * + prefix than a clflush + nop, and hence the prefix is added instead + * of letting the alternative framework fill the gap by appending nops. + */ + alternative_io("ds; clflush %[p]", + "data16 clflush %[p]", /* clflushopt */ + X86_FEATURE_CLFLUSHOPT, + /* no outputs */, + [p] "m" (*(const char *)(addr))); + } + + alternative_2("", + "sfence", X86_FEATURE_CLFLUSHOPT, + "mfence", X86_BUG_CLFLUSH_MFENCE); +} + +void cache_writeback(const void *addr, unsigned int size) +{ + unsigned int clflush_size; + const void *end = addr + size; + + /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ + if ( !boot_cpu_has(X86_FEATURE_CLWB) ) + return cache_flush(addr, size); + + /* + * This function may be called before current_cpu_data is established. + * Hence a fallback is needed to prevent the loop below becoming infinite. + */ + clflush_size = current_cpu_data.x86_clflush_size ?: 16; + addr -= (unsigned long)addr & (clflush_size - 1); + for ( ; addr < end; addr += clflush_size ) + { +/* + * The arguments to a macro must not include preprocessor directives. Doing so + * results in undefined behavior, so we have to create some defines here in + * order to avoid it. + */ +#if defined(HAVE_AS_CLWB) +# define CLWB_ENCODING "clwb %[p]" +#elif defined(HAVE_AS_XSAVEOPT) +# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ +#else +# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ +#endif + +#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) +#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) +# define INPUT BASE_INPUT +#else +# define INPUT(addr) "a" (addr), BASE_INPUT(addr) +#endif + + asm volatile (CLWB_ENCODING :: INPUT(addr)); + +#undef INPUT +#undef BASE_INPUT +#undef CLWB_ENCODING + } + + asm volatile ("sfence" ::: "memory"); +} + unsigned int guest_flush_tlb_flags(const struct domain *d) { bool shadow = paging_mode_shadow(d); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/svm/entry.S xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/svm/entry.S --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/svm/entry.S 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/svm/entry.S 2022-07-12 15:31:49.000000000 +0000 @@ -101,7 +101,19 @@ GET_CURRENT(bx) - /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo Clob: acd */ + /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo, %rdx=0 Clob: acd */ + + .macro svm_vmexit_cond_ibpb + testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) + jz .L_skip_ibpb + + mov $MSR_PRED_CMD, %ecx + mov $PRED_CMD_IBPB, %eax + wrmsr +.L_skip_ibpb: + .endm + ALTERNATIVE "", svm_vmexit_cond_ibpb, X86_FEATURE_IBPB_ENTRY_HVM + ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM .macro svm_vmexit_spec_ctrl @@ -118,6 +130,10 @@ ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + /* + * STGI is executed unconditionally, and is sufficiently serialising + * to safely resolve any Spectre-v1 concerns in the above logic. 
+ */ STGI GLOBAL(svm_stgi_label) mov %rsp,%rdi diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/svm/vmcb.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/svm/vmcb.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/svm/vmcb.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/svm/vmcb.c 2022-07-12 15:31:49.000000000 +0000 @@ -29,6 +29,7 @@ #include #include #include +#include <asm/spec_ctrl.h> struct vmcb_struct *alloc_vmcb(void) { @@ -175,6 +176,14 @@ vmcb->_pause_filter_thresh = SVM_PAUSETHRESH_INIT; } + /* + * When default_xen_spec_ctrl is simply SPEC_CTRL_STIBP, default this behind + * the back of the VM too. Our SMT topology isn't accurate, the overhead + * is negligible, and doing this saves a WRMSR on the vmentry path. + */ + if ( default_xen_spec_ctrl == SPEC_CTRL_STIBP ) + v->arch.msrs->spec_ctrl.raw = SPEC_CTRL_STIBP; + return 0; } diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/vmx/entry.S xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/vmx/entry.S --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/vmx/entry.S 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/vmx/entry.S 2022-07-12 15:31:49.000000000 +0000 @@ -87,7 +87,7 @@ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ - ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM + DO_SPEC_CTRL_COND_VERW mov VCPU_hvm_guest_cr2(%rbx),%rax diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/vmx/vmcs.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/vmx/vmcs.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/hvm/vmx/vmcs.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/hvm/vmx/vmcs.c 2022-07-12 15:31:49.000000000 +0000 @@ -1332,6 +1332,10 @@ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, VMX_MSR_GUEST_LOADONLY); + if ( !rc && (d->arch.spec_ctrl_flags & SCF_entry_ibpb) ) + rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB, + VMX_MSR_HOST); + out: vmx_vmcs_exit(v); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/livepatch.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/livepatch.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/livepatch.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/livepatch.c 2022-07-12 15:31:49.000000000 +0000 @@ -290,6 +290,13 @@ elf->name, symndx); return -EINVAL; } + else if ( elf->sym[symndx].ignored ) + { + printk(XENLOG_ERR LIVEPATCH + "%s: Relocation against ignored symbol %s cannot be resolved\n", + elf->name, elf->sym[symndx].name); + return -EINVAL; + } val = r->r_addend + elf->sym[symndx].sym->st_value; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/mm.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/mm.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/mm.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/mm.c 2022-07-12 15:31:49.000000000 +0000 @@ -785,24 +785,6 @@ return (page_get_owner(page) == dom_io); } -static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) -{ - int err = 0; - bool alias = mfn >= PFN_DOWN(xen_phys_start) && - mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); - unsigned long xen_va = - XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); - - if ( unlikely(alias) && cacheattr ) - err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); - if ( !err ) - err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, - PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); - if ( unlikely(alias) && !cacheattr && !err ) - err =
map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); - return err; -} - #ifndef NDEBUG struct mmio_emul_range_ctxt { const struct domain *d; @@ -1007,49 +989,25 @@ goto could_not_pin; } - if ( pte_flags_to_cacheattr(l1f) != - ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) + if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) { - unsigned long x, nx, y = page->count_info; - unsigned long cacheattr = pte_flags_to_cacheattr(l1f); - int err; - - if ( is_special_page(page) ) - { - if ( write ) - put_page_type(page); - put_page(page); - gdprintk(XENLOG_WARNING, - "Attempt to change cache attributes of Xen heap page\n"); - return -EACCES; - } - - do { - x = y; - nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); - } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); - - err = update_xen_mappings(mfn, cacheattr); - if ( unlikely(err) ) - { - cacheattr = y & PGC_cacheattr_mask; - do { - x = y; - nx = (x & ~PGC_cacheattr_mask) | cacheattr; - } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); - - if ( write ) - put_page_type(page); - put_page(page); - - gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn - " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", - mfn, get_gpfn_from_mfn(mfn), - l1e_get_intpte(l1e), l1e_owner->domain_id); - return err; - } + if ( write ) + put_page_type(page); + put_page(page); + gdprintk(XENLOG_WARNING, + "Attempt to change cache attributes of Xen heap page\n"); + return -EACCES; } + /* + * Track writeable non-coherent mappings to RAM pages, to trigger a cache + * flush later if the target is used as anything but a PGT_writeable page. + * We care about all writeable mappings, including foreign mappings. + */ + if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) && + (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) ) + set_bit(_PGT_non_coherent, &page->u.inuse.type_info); + return 0; could_not_pin: @@ -2453,25 +2411,10 @@ */ static int cleanup_page_mappings(struct page_info *page) { - unsigned int cacheattr = - (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; int rc = 0; unsigned long mfn = mfn_x(page_to_mfn(page)); /* - * If we've modified xen mappings as a result of guest cache - * attributes, restore them to the "normal" state. - */ - if ( unlikely(cacheattr) ) - { - page->count_info &= ~PGC_cacheattr_mask; - - BUG_ON(is_special_page(page)); - - rc = update_xen_mappings(mfn, 0); - } - - /* * If this may be in a PV domain's IOMMU, remove it. * * NB that writable xenheap pages have their type set and cleared by @@ -2510,6 +2453,19 @@ } } + /* + * Flush the cache if there were previously non-coherent writeable + * mappings of this page. This forces the page to be coherent before it + * is freed back to the heap. 
+ */ + if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) ) + { + void *addr = __map_domain_page(page); + + cache_flush(addr, PAGE_SIZE); + unmap_domain_page(addr); + } + return rc; } @@ -2892,16 +2848,17 @@ static int _get_page_type(struct page_info *page, unsigned long type, bool preemptible) { - unsigned long nx, x, y = page->u.inuse.type_info; + unsigned long nx, x; int rc = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); ASSERT(!in_irq()); - for ( ; ; ) + for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) { x = y; nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) { gdprintk(XENLOG_WARNING, @@ -2909,63 +2866,33 @@ mfn_x(page_to_mfn(page))); return -EINVAL; } - else if ( unlikely((x & PGT_count_mask) == 0) ) - { - struct domain *d = page_get_owner(page); - if ( d && shadow_mode_enabled(d) ) - shadow_prepare_page_type_change(d, page, type); + if ( unlikely((x & PGT_count_mask) == 0) ) + { + /* + * Typeref 0 -> 1. + * + * Type changes are permitted when the typeref is 0. If the type + * actually changes, the page needs re-validating. + */ ASSERT(!(x & PGT_pae_xen_l2)); if ( (x & PGT_type_mask) != type ) { - /* - * On type change we check to flush stale TLB entries. It is - * vital that no other CPUs are left with mappings of a frame - * which is about to become writeable to the guest. - */ - cpumask_t *mask = this_cpu(scratch_cpumask); - - BUG_ON(in_irq()); - cpumask_copy(mask, d->dirty_cpumask); - - /* Don't flush if the timestamp is old enough */ - tlbflush_filter(mask, page->tlbflush_timestamp); - - if ( unlikely(!cpumask_empty(mask)) && - /* Shadow mode: track only writable pages. */ - (!shadow_mode_enabled(page_get_owner(page)) || - ((nx & PGT_type_mask) == PGT_writable_page)) ) - { - perfc_incr(need_flush_tlb_flush); - /* - * If page was a page table make sure the flush is - * performed using an IPI in order to avoid changing the - * type of a page table page under the feet of - * spurious_page_fault(). - */ - flush_mask(mask, - (x & PGT_type_mask) && - (x & PGT_type_mask) <= PGT_root_page_table - ? FLUSH_TLB | FLUSH_FORCE_IPI - : FLUSH_TLB); - } - - /* We lose existing type and validity. */ nx &= ~(PGT_type_mask | PGT_validated); nx |= type; - - /* - * No special validation needed for writable pages. - * Page tables and GDT/LDT need to be scanned for validity. - */ - if ( type == PGT_writable_page || type == PGT_shared_page ) - nx |= PGT_validated; } } else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) { - /* Don't log failure if it could be a recursive-mapping attempt. */ + /* + * else, we're trying to take a new reference, of the wrong type. + * + * This (being able to prohibit use of the wrong type) is what the + * typeref system exists for, but skip printing the failure if it + * looks like a recursive mapping, as subsequent logic might + * ultimately permit the attempt. + */ if ( ((x & PGT_type_mask) == PGT_l2_page_table) && (type == PGT_l1_page_table) ) return -EINVAL; @@ -2984,18 +2911,47 @@ } else if ( unlikely(!(x & PGT_validated)) ) { + /* + * else, the count is non-zero, and we're grabbing the right type; + * but the page hasn't been validated yet. + * + * The page is in one of two states (depending on PGT_partial), + * and should have exactly one reference. + */ + ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) == + (type | 1)); + if ( !(x & PGT_partial) ) { - /* Someone else is updating validation of this page. Wait... 
*/ + /* + * The page has been left in the "validate locked" state + * (i.e. PGT_[type] | 1) which means that a concurrent caller + * of _get_page_type() is in the middle of validation. + * + * Spin waiting for the concurrent user to complete (partial + * or fully validated), then restart our attempt to acquire a + * type reference. + */ do { if ( preemptible && hypercall_preempt_check() ) return -EINTR; cpu_relax(); - } while ( (y = page->u.inuse.type_info) == x ); + } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x ); continue; } - /* Type ref count was left at 1 when PGT_partial got set. */ - ASSERT((x & PGT_count_mask) == 1); + + /* + * The page has been left in the "partial" state + * (i.e., PGT_[type] | PGT_partial | 1). + * + * Rather than bumping the type count, we need to try to grab the + * validation lock; if we succeed, we need to validate the page, + * then drop the general ref associated with the PGT_partial bit. + * + * We grab the validation lock by setting nx to (PGT_[type] | 1) + * (i.e., non-zero type count, neither PGT_validated nor + * PGT_partial set). + */ nx = x & ~PGT_partial; } @@ -3006,6 +2962,56 @@ return -EINTR; } + /* + * One typeref has been taken and is now globally visible. + * + * The page is either in the "validate locked" state (PGT_[type] | 1) or + * fully validated (PGT_[type] | PGT_validated | >0). + */ + + if ( unlikely((x & PGT_count_mask) == 0) ) + { + struct domain *d = page_get_owner(page); + + if ( d && shadow_mode_enabled(d) ) + shadow_prepare_page_type_change(d, page, type); + + if ( (x & PGT_type_mask) != type ) + { + /* + * On type change we check to flush stale TLB entries. It is + * vital that no other CPUs are left with writeable mappings + * to a frame which is intending to become pgtable/segdesc. + */ + cpumask_t *mask = this_cpu(scratch_cpumask); + + BUG_ON(in_irq()); + cpumask_copy(mask, d->dirty_cpumask); + + /* Don't flush if the timestamp is old enough */ + tlbflush_filter(mask, page->tlbflush_timestamp); + + if ( unlikely(!cpumask_empty(mask)) && + /* Shadow mode: track only writable pages. */ + (!shadow_mode_enabled(d) || + ((nx & PGT_type_mask) == PGT_writable_page)) ) + { + perfc_incr(need_flush_tlb_flush); + /* + * If page was a page table make sure the flush is + * performed using an IPI in order to avoid changing the + * type of a page table page under the feet of + * spurious_page_fault(). + */ + flush_mask(mask, + (x & PGT_type_mask) && + (x & PGT_type_mask) <= PGT_root_page_table + ? FLUSH_TLB | FLUSH_FORCE_IPI + : FLUSH_TLB); + } + } + } + if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != (type == PGT_writable_page)) ) { @@ -3032,16 +3038,51 @@ if ( unlikely(!(nx & PGT_validated)) ) { - if ( !(x & PGT_partial) ) + /* + * Flush the cache if there were previously non-coherent mappings of + * this page, and we're trying to use it as anything other than a + * writeable page. This forces the page to be coherent before we + * validate its contents for safety. + */ + if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) + { + void *addr = __map_domain_page(page); + + cache_flush(addr, PAGE_SIZE); + unmap_domain_page(addr); + + page->u.inuse.type_info &= ~PGT_non_coherent; + } + + /* + * No special validation needed for writable or shared pages. Page + * tables and GDT/LDT need to have their contents audited. + * + * per validate_page(), non-atomic updates are fine here. 
+ */ + if ( type == PGT_writable_page || type == PGT_shared_page ) + page->u.inuse.type_info |= PGT_validated; + else { - page->nr_validated_ptes = 0; - page->partial_flags = 0; - page->linear_pt_count = 0; + if ( !(x & PGT_partial) ) + { + page->nr_validated_ptes = 0; + page->partial_flags = 0; + page->linear_pt_count = 0; + } + + rc = validate_page(page, type, preemptible); } - rc = validate_page(page, type, preemptible); } out: + /* + * Did we drop the PGT_partial bit when acquiring the typeref? If so, + * drop the general reference that went along with it. + * + * N.B. validate_page() may have re-set PGT_partial, not reflected in + * nx, but will have taken an extra ref when doing so. + */ if ( (x & PGT_partial) && !(nx & PGT_partial) ) put_page(page); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/pv/grant_table.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/pv/grant_table.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/pv/grant_table.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/pv/grant_table.c 2022-07-12 15:31:49.000000000 +0000 @@ -109,7 +109,17 @@ ol1e = *pl1e; if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) + { + /* + * We always create mappings in this path. However, our caller, + * map_grant_ref(), only passes potentially non-zero cache_flags for + * MMIO frames, so this path doesn't create non-coherent mappings of + * RAM frames and there's no need to calculate PGT_non_coherent. + */ + ASSERT(!cache_flags || is_iomem_page(frame)); + rc = GNTST_okay; + } out_unlock: page_unlock(page); @@ -294,7 +304,18 @@ l1e_get_flags(ol1e), addr, grant_pte_flags); if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) + { + /* + * Generally, replace_grant_pv_mapping() is used to destroy mappings + * (nl1e = l1e_empty()), but it can be a present mapping on the + * GNTABOP_unmap_and_replace path. + * + * In such cases, the PTE is fully transplanted from its old location + * via steal_linear_addr(), so we need not perform PGT_non_coherent + * checking here. + */ rc = GNTST_okay; + } out_unlock: page_unlock(page); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/spec_ctrl.c xen-4.14.5+24-g87d90d511c/xen/arch/x86/spec_ctrl.c --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/spec_ctrl.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/spec_ctrl.c 2022-07-12 15:31:49.000000000 +0000 @@ -36,8 +36,12 @@ static bool __initdata opt_msr_sc_hvm = true; static bool __initdata opt_rsb_pv = true; static bool __initdata opt_rsb_hvm = true; -static int8_t __initdata opt_md_clear_pv = -1; -static int8_t __initdata opt_md_clear_hvm = -1; +static int8_t __read_mostly opt_md_clear_pv = -1; +static int8_t __read_mostly opt_md_clear_hvm = -1; + +static int8_t __read_mostly opt_ibpb_entry_pv = -1; +static int8_t __read_mostly opt_ibpb_entry_hvm = -1; +static bool __read_mostly opt_ibpb_entry_dom0; /* Cmdline controls for Xen's speculative settings. */ static enum ind_thunk { @@ -48,9 +52,13 @@ THUNK_LFENCE, THUNK_JMP, } opt_thunk __initdata = THUNK_DEFAULT; + static int8_t __initdata opt_ibrs = -1; -bool __read_mostly opt_ibpb = true; -bool __read_mostly opt_ssbd = false; +int8_t __initdata opt_stibp = -1; +bool __read_mostly opt_ssbd; +int8_t __initdata opt_psfd = -1; + +int8_t __read_mostly opt_ibpb_ctxt_switch = -1; int8_t __read_mostly opt_eager_fpu = -1; int8_t __read_mostly opt_l1d_flush = -1; bool __read_mostly opt_branch_harden = true; @@ -67,6 +75,8 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination.
*/ static int8_t __initdata opt_srb_lock = -1; +static bool __initdata opt_unpriv_mmio; +static bool __read_mostly opt_fb_clear_mmio; static int __init parse_spec_ctrl(const char *s) { @@ -108,14 +118,18 @@ opt_rsb_hvm = false; opt_md_clear_pv = 0; opt_md_clear_hvm = 0; + opt_ibpb_entry_pv = 0; + opt_ibpb_entry_hvm = 0; + opt_ibpb_entry_dom0 = false; opt_thunk = THUNK_JMP; opt_ibrs = 0; - opt_ibpb = false; + opt_ibpb_ctxt_switch = false; opt_ssbd = false; opt_l1d_flush = 0; opt_branch_harden = false; opt_srb_lock = 0; + opt_unpriv_mmio = false; } else if ( val > 0 ) rc = -EINVAL; @@ -133,27 +147,99 @@ opt_msr_sc_pv = val; opt_rsb_pv = val; opt_md_clear_pv = val; + opt_ibpb_entry_pv = val; } else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) { opt_msr_sc_hvm = val; opt_rsb_hvm = val; opt_md_clear_hvm = val; + opt_ibpb_entry_hvm = val; } - else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 ) + else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) { - opt_msr_sc_pv = val; - opt_msr_sc_hvm = val; + switch ( val ) + { + case 0: + case 1: + opt_msr_sc_pv = opt_msr_sc_hvm = val; + break; + + case -2: + s += strlen("msr-sc="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) + opt_msr_sc_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + opt_msr_sc_hvm = val; + else + default: + rc = -EINVAL; + break; + } } - else if ( (val = parse_boolean("rsb", s, ss)) >= 0 ) + else if ( (val = parse_boolean("rsb", s, ss)) != -1 ) { - opt_rsb_pv = val; - opt_rsb_hvm = val; + switch ( val ) + { + case 0: + case 1: + opt_rsb_pv = opt_rsb_hvm = val; + break; + + case -2: + s += strlen("rsb="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) + opt_rsb_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + opt_rsb_hvm = val; + else + default: + rc = -EINVAL; + break; + } } - else if ( (val = parse_boolean("md-clear", s, ss)) >= 0 ) + else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) { - opt_md_clear_pv = val; - opt_md_clear_hvm = val; + switch ( val ) + { + case 0: + case 1: + opt_md_clear_pv = opt_md_clear_hvm = val; + break; + + case -2: + s += strlen("md-clear="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) + opt_md_clear_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + opt_md_clear_hvm = val; + else + default: + rc = -EINVAL; + break; + } + } + else if ( (val = parse_boolean("ibpb-entry", s, ss)) != -1 ) + { + switch ( val ) + { + case 0: + case 1: + opt_ibpb_entry_pv = opt_ibpb_entry_hvm = + opt_ibpb_entry_dom0 = val; + break; + + case -2: + s += strlen("ibpb-entry="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) + opt_ibpb_entry_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + opt_ibpb_entry_hvm = val; + else + default: + rc = -EINVAL; + break; + } } /* Xen's speculative sidechannel mitigation settings. */ @@ -170,12 +256,20 @@ else rc = -EINVAL; } + + /* Bits in MSR_SPEC_CTRL. */ else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 ) opt_ibrs = val; - else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) - opt_ibpb = val; + else if ( (val = parse_boolean("stibp", s, ss)) >= 0 ) + opt_stibp = val; else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) opt_ssbd = val; + else if ( (val = parse_boolean("psfd", s, ss)) >= 0 ) + opt_psfd = val; + + /* Misc settings. 
*/ + else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) + opt_ibpb_ctxt_switch = val; else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) opt_eager_fpu = val; else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) @@ -184,6 +278,8 @@ opt_branch_harden = val; else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) + opt_unpriv_mmio = val; else rc = -EINVAL; @@ -323,7 +419,7 @@ * Hardware read-only information, stating immunity to certain issues, or * suggestions of which mitigation to use. */ - printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", @@ -332,13 +428,17 @@ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "", + (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", + (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", + (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : ""); + (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", + (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); /* Hardware features which need driving to mitigate issues. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || @@ -353,7 +453,9 @@ (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "", (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "", (e8b & cpufeat_mask(X86_FEATURE_VIRT_SSBD)) ? " VIRT_SSBD" : "", - (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : ""); + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", + (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", + (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : ""); /* Compiled-in support which pertains to mitigations. */ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) @@ -367,7 +469,7 @@ "\n"); /* Settings for Xen's protection, irrespective of guests. */ - printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s, Other:%s%s%s%s%s\n", + printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", thunk == THUNK_NONE ? "N/A" : thunk == THUNK_RETPOLINE ? "RETPOLINE" : thunk == THUNK_LFENCE ? "LFENCE" : @@ -381,13 +483,17 @@ (!boot_cpu_has(X86_FEATURE_SSBD) && !boot_cpu_has(X86_FEATURE_AMD_SSBD)) ? "" : (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", + (!boot_cpu_has(X86_FEATURE_PSFD) && + !boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ? "" : + (default_xen_spec_ctrl & SPEC_CTRL_PSFD) ? " PSFD+" : " PSFD-", !(caps & ARCH_CAPS_TSX_CTRL) ? "" : (opt_tsx & 1) ? " TSX+" : " TSX-", !cpu_has_srbds_ctrl ? "" : opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", - opt_ibpb ? " IBPB" : "", + opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "", + opt_md_clear_pv || opt_md_clear_hvm || + opt_fb_clear_mmio ? " VERW" : "", opt_branch_harden ? 
" BRANCH_HARDEN" : ""); /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ @@ -402,27 +508,31 @@ * mitigation support for guests. */ #ifdef CONFIG_HVM - printk(" Support for HVM VMs:%s%s%s%s%s\n", + printk(" Support for HVM VMs:%s%s%s%s%s%s\n", (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || boot_cpu_has(X86_FEATURE_MD_CLEAR) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || opt_eager_fpu) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", opt_eager_fpu ? " EAGER_FPU" : "", - boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); + boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); #endif #ifdef CONFIG_PV - printk(" Support for PV VMs:%s%s%s%s%s\n", + printk(" Support for PV VMs:%s%s%s%s%s%s\n", (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || boot_cpu_has(X86_FEATURE_SC_RSB_PV) || boot_cpu_has(X86_FEATURE_MD_CLEAR) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || opt_eager_fpu) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", opt_eager_fpu ? " EAGER_FPU" : "", - boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); + boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", opt_xpti_hwdom ? "enabled" : "disabled", @@ -655,6 +765,55 @@ } } +static void __init ibpb_calculations(void) +{ + /* Check we have hardware IBPB support before using it... */ + if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) + { + opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0; + opt_ibpb_entry_dom0 = false; + return; + } + + /* + * IBPB-on-entry mitigations for Branch Type Confusion. + * + * IBPB && !BTC_NO selects all AMD/Hygon hardware, not known to be safe, + * that we can provide some form of mitigation on. + */ + if ( opt_ibpb_entry_pv == -1 ) + opt_ibpb_entry_pv = (IS_ENABLED(CONFIG_PV) && + boot_cpu_has(X86_FEATURE_IBPB) && + !boot_cpu_has(X86_FEATURE_BTC_NO)); + if ( opt_ibpb_entry_hvm == -1 ) + opt_ibpb_entry_hvm = (IS_ENABLED(CONFIG_HVM) && + boot_cpu_has(X86_FEATURE_IBPB) && + !boot_cpu_has(X86_FEATURE_BTC_NO)); + + if ( opt_ibpb_entry_pv ) + { + setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_PV); + + /* + * We only need to flush in IST context if we're protecting against PV + * guests. HVM IBPB-on-entry protections are both atomic with + * NMI/#MC, so can't interrupt Xen ahead of having already flushed the + * BTB. + */ + default_spec_ctrl_flags |= SCF_ist_ibpb; + } + if ( opt_ibpb_entry_hvm ) + setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_HVM); + + /* + * If we're using IBPB-on-entry to protect against PV and HVM guests + * (ignoring dom0 if trusted), then there's no need to also issue IBPB on + * context switch too. + */ + if ( opt_ibpb_ctxt_switch == -1 ) + opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv); +} + /* Calculate whether this CPU is vulnerable to L1TF. */ static __init void l1tf_calculations(uint64_t caps) { @@ -903,6 +1062,22 @@ } } +void spec_ctrl_init_domain(struct domain *d) +{ + bool pv = is_pv_domain(d); + + bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || + (opt_fb_clear_mmio && is_iommu_enabled(d))); + + bool ibpb = ((pv ? 
opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && + (d->domain_id != 0 || opt_ibpb_entry_dom0)); + + d->arch.spec_ctrl_flags = + (verw ? SCF_verw : 0) | + (ibpb ? SCF_entry_ibpb : 0) | + 0; +} + void __init init_speculation_mitigations(void) { enum ind_thunk thunk = THUNK_DEFAULT; @@ -931,10 +1106,7 @@ if ( !has_spec_ctrl ) printk(XENLOG_WARNING "?!? CET active, but no MSR_SPEC_CTRL?\n"); else if ( opt_ibrs == -1 ) - { opt_ibrs = ibrs = true; - default_xen_spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_STIBP; - } if ( opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE ) thunk = THUNK_JMP; @@ -1003,7 +1175,7 @@ { if ( opt_msr_sc_pv ) { - default_spec_ctrl_flags |= SCF_ist_wrmsr; + default_spec_ctrl_flags |= SCF_ist_sc_msr; setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV); } @@ -1014,7 +1186,7 @@ * Xen's value is not restored atomically. An early NMI hitting * the VMExit path needs to restore Xen's value for safety. */ - default_spec_ctrl_flags |= SCF_ist_wrmsr; + default_spec_ctrl_flags |= SCF_ist_sc_msr; setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); } } @@ -1027,7 +1199,7 @@ * on real hardware matches the availability of MSR_SPEC_CTRL in the * first place. * - * No need for SCF_ist_wrmsr because Xen's value is restored + * No need for SCF_ist_sc_msr because Xen's value is restored * atomically WRT NMIs in the VMExit path. * * TODO: Adjust cpu_has_svm_spec_ctrl to be usable earlier on boot. @@ -1038,14 +1210,52 @@ setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); } - /* If we have IBRS available, see whether we should use it. */ + /* Figure out default_xen_spec_ctrl. */ if ( has_spec_ctrl && ibrs ) + { + /* IBRS implies STIBP. */ + if ( opt_stibp == -1 ) + opt_stibp = 1; + default_xen_spec_ctrl |= SPEC_CTRL_IBRS; + } + + /* + * Use STIBP by default on all AMD systems. Zen3 and later enumerate + * STIBP_ALWAYS, but STIBP is needed on Zen2 as part of the mitigations + * for Branch Type Confusion. + * + * Leave STIBP off by default on Intel. Pre-eIBRS systems suffer a + * substantial perf hit when it was implemented in microcode. + */ + if ( opt_stibp == -1 ) + opt_stibp = !!boot_cpu_has(X86_FEATURE_AMD_STIBP); + + if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) || + boot_cpu_has(X86_FEATURE_AMD_STIBP)) ) + default_xen_spec_ctrl |= SPEC_CTRL_STIBP; - /* If we have SSBD available, see whether we should use it. */ if ( opt_ssbd && (boot_cpu_has(X86_FEATURE_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) ) + { + /* SSBD implies PSFD */ + if ( opt_psfd == -1 ) + opt_psfd = 1; + default_xen_spec_ctrl |= SPEC_CTRL_SSBD; + } + + /* + * Don't use PSFD by default. AMD designed the predictor to + * auto-clear on privilege change. PSFD is implied by SSBD, which is + * off by default. + */ + if ( opt_psfd == -1 ) + opt_psfd = 0; + + if ( opt_psfd && (boot_cpu_has(X86_FEATURE_PSFD) || + boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ) + default_xen_spec_ctrl |= SPEC_CTRL_PSFD; /* * PV guests can poison the RSB to any virtual address from which @@ -1073,9 +1283,7 @@ if ( opt_rsb_hvm ) setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM); - /* Check we have hardware IBPB support before using it... */ - if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) - opt_ibpb = false; + ibpb_calculations(); /* Check whether Eager FPU should be enabled by default. */ if ( opt_eager_fpu == -1 ) @@ -1084,8 +1292,14 @@ /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */ init_shadow_spec_ctrl_state(); - /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. 
*/ - if ( default_xen_spec_ctrl ) + /* + * For microcoded IBRS only (i.e. Intel, pre eIBRS), it is recommended to + * clear MSR_SPEC_CTRL before going idle, to avoid impacting sibling + * threads. Activate this if SMT is enabled, and Xen is using a non-zero + * MSR_SPEC_CTRL setting. + */ + if ( boot_cpu_has(X86_FEATURE_IBRSB) && !(caps & ARCH_CAPS_IBRS_ALL) && + hw_smt_enabled && default_xen_spec_ctrl ) setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); xpti_init_default(caps); @@ -1136,6 +1350,18 @@ mds_calculations(caps); /* + * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have + * reintroduced the VERW fill buffer flushing side effect because of a + * susceptibility to FBSDP. + * + * If unprivileged guests have (or will have) MMIO mappings, we can + * mitigate cross-domain leakage of fill buffer data by issuing VERW on + * the return-to-guest path. + */ + if ( opt_unpriv_mmio ) + opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR; + + /* * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. * This will only be a token effort for MLPDS/MFBDS when HT is enabled, * but it is somewhat better than nothing. @@ -1148,21 +1374,22 @@ boot_cpu_has(X86_FEATURE_MD_CLEAR)); /* - * Enable MDS defences as applicable. The PV blocks need using all the - * time, and the Idle blocks need using if either PV or HVM defences are - * used. + * Enable MDS/MMIO defences as applicable. The Idle blocks need using if + * either the PV or HVM MDS defences are used, or if we may give MMIO + * access to untrusted guests. * * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with - * equivelent semantics to avoid needing to perform both flushes on the - * HVM path. The HVM blocks don't need activating if our hypervisor told - * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves. - */ - if ( opt_md_clear_pv ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV); - if ( opt_md_clear_pv || opt_md_clear_hvm ) + * equivalent semantics to avoid needing to perform both flushes on the + * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for + * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) + * + * After calculating the appropriate idle setting, simplify + * opt_md_clear_hvm to mean just "should we VERW on the way into HVM + * guests", so spec_ctrl_init_domain() can calculate suitable settings. + */ + if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); - if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM); + opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; /* * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT @@ -1225,14 +1452,19 @@ * On some SRBDS-affected hardware, it may be safe to relax srb-lock by * default. * - * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known - * way to access the Fill Buffer. If TSX isn't available (inc. SKU - * reasons on some models), or TSX is explicitly disabled, then there is - * no need for the extra overhead to protect RDRAND/RDSEED. + * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG + * data becomes available to other contexts. 
To recover the data, an + * attacker needs to use: + * - SBDS (MDS or TAA to sample the cores fill buffer) + * - SBDR (Architecturally retrieve stale transaction buffer contents) + * - DRPW (Architecturally latch stale fill buffer data) + * + * On MDS_NO parts, and with TAA_NO or TSX unavailable/disabled, and there + * is no unprivileged MMIO access, the RNG data doesn't need protecting. */ if ( cpu_has_srbds_ctrl ) { - if ( opt_srb_lock == -1 && + if ( opt_srb_lock == -1 && !opt_unpriv_mmio && (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO && (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) ) opt_srb_lock = 0; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/x86_64/compat/entry.S xen-4.14.5+24-g87d90d511c/xen/arch/x86/x86_64/compat/entry.S --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/x86_64/compat/entry.S 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/x86_64/compat/entry.S 2022-07-12 15:31:49.000000000 +0000 @@ -18,7 +18,7 @@ movl $HYPERCALL_VECTOR, 4(%rsp) SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ CR4_PV32_RESTORE @@ -212,7 +212,7 @@ movl $TRAP_syscall, 4(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ GET_STACK_END(bx) diff -Nru xen-4.14.4+74-gd7b22226b5/xen/arch/x86/x86_64/entry.S xen-4.14.5+24-g87d90d511c/xen/arch/x86/x86_64/entry.S --- xen-4.14.4+74-gd7b22226b5/xen/arch/x86/x86_64/entry.S 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/arch/x86/x86_64/entry.S 2022-07-12 15:31:49.000000000 +0000 @@ -248,7 +248,7 @@ movl $TRAP_syscall, 4(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ GET_STACK_END(bx) @@ -287,7 +287,7 @@ movl $TRAP_syscall, 4(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ GET_STACK_END(bx) @@ -339,7 +339,7 @@ movl $0x80, 4(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ GET_STACK_END(bx) @@ -600,7 +600,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx @@ -633,7 +633,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. 
*/ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx @@ -854,7 +854,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx @@ -889,7 +889,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx diff -Nru xen-4.14.4+74-gd7b22226b5/xen/common/grant_table.c xen-4.14.5+24-g87d90d511c/xen/common/grant_table.c --- xen-4.14.4+74-gd7b22226b5/xen/common/grant_table.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/common/grant_table.c 2022-07-12 15:31:49.000000000 +0000 @@ -3440,7 +3440,7 @@ return 0; } -static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) +static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) { struct domain *d, *owner; struct page_info *page; @@ -3534,7 +3534,7 @@ return -EFAULT; for ( ; ; ) { - int ret = cache_flush(&op, cur_ref); + int ret = _cache_flush(&op, cur_ref); if ( ret < 0 ) return ret; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/common/kernel.c xen-4.14.5+24-g87d90d511c/xen/common/kernel.c --- xen-4.14.4+74-gd7b22226b5/xen/common/kernel.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/common/kernel.c 2022-07-12 15:31:49.000000000 +0000 @@ -272,9 +272,9 @@ int parse_boolean(const char *name, const char *s, const char *e) { size_t slen, nlen; - int val = !!strncmp(s, "no-", 3); + bool has_neg_prefix = !strncmp(s, "no-", 3); - if ( !val ) + if ( has_neg_prefix ) s += 3; slen = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); @@ -286,11 +286,23 @@ /* Exact, unadorned name? Result depends on the 'no-' prefix. */ if ( slen == nlen ) - return val; + return !has_neg_prefix; + + /* Inexact match with a 'no-' prefix? Not valid. */ + if ( has_neg_prefix ) + return -1; /* =$SOMETHING? Defer to the regular boolean parsing. */ if ( s[nlen] == '=' ) - return parse_bool(&s[nlen + 1], e); + { + int b = parse_bool(&s[nlen + 1], e); + + if ( b >= 0 ) + return b; + + /* Not a boolean, but the name matched. Signal specially. */ + return -2; + } /* Unrecognised. Give up. */ return -1; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/common/livepatch.c xen-4.14.5+24-g87d90d511c/xen/common/livepatch.c --- xen-4.14.4+74-gd7b22226b5/xen/common/livepatch.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/common/livepatch.c 2022-07-12 15:31:49.000000000 +0000 @@ -301,9 +301,6 @@ * and .shstrtab. For the non-relocate we allocate and copy these * via other means - and the .rel we can ignore as we only use it * once during loading. - * - * Also ignore sections with zero size. Those can be for example: - * data, or .bss. */ if ( livepatch_elf_ignore_section(elf->sec[i].sec) ) offset[i] = UINT_MAX; @@ -362,8 +359,17 @@ else if ( elf->sec[i].sec->sh_flags & SHF_WRITE ) { buf = rw_buf; - rw_buf_sec = i; - rw_buf_cnt++; + if ( elf->sec[i].sec->sh_size ) + { + /* + * Special handling of RW empty regions: do not account for + * them in order to decide whether a patch can safely be + * re-applied, but assign them a load address so symbol + * resolution and relocations work. 
+ */ + rw_buf_sec = i; + rw_buf_cnt++; + } } else buf = ro_buf; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/common/livepatch_elf.c xen-4.14.5+24-g87d90d511c/xen/common/livepatch_elf.c --- xen-4.14.4+74-gd7b22226b5/xen/common/livepatch_elf.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/common/livepatch_elf.c 2022-07-12 15:31:49.000000000 +0000 @@ -334,7 +334,13 @@ } if ( livepatch_elf_ignore_section(elf->sec[idx].sec) ) + { + dprintk(XENLOG_DEBUG, LIVEPATCH + "%s: Symbol %s from section %s ignored\n", + elf->name, elf->sym[i].name, elf->sec[idx].name); + elf->sym[i].ignored = true; break; + } st_value += (unsigned long)elf->sec[idx].load_addr; if ( elf->sym[i].name ) diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/extern.h xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/extern.h --- xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/extern.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/extern.h 2022-07-12 15:31:49.000000000 +0000 @@ -77,7 +77,6 @@ struct pci_dev *pdev, u16 did, u16 size, u64 addr); -unsigned int get_cache_line_size(void); void flush_all_cache(void); uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); @@ -89,7 +88,7 @@ const struct pci_dev *pdev, domid_t domid, paddr_t pgd_maddr, unsigned int mode); int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, - uint8_t bus, uint8_t devfn, domid_t domid); + uint8_t bus, uint8_t devfn); int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/iommu.c xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/iommu.c --- xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/iommu.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/iommu.c 2022-07-12 15:31:49.000000000 +0000 @@ -31,6 +31,7 @@ #include <xen/pci.h> #include <xen/pci_regs.h> #include <xen/keyhandler.h> +#include <asm/cache.h> #include <asm/msi.h> #include <asm/nops.h> #include <asm/irq.h> @@ -207,53 +208,10 @@ static void sync_cache(const void *addr, unsigned int size) { - static unsigned long clflush_size = 0; - const void *end = addr + size; - if ( !iommus_incoherent ) return; - if ( clflush_size == 0 ) - clflush_size = get_cache_line_size(); - - addr -= (unsigned long)addr & (clflush_size - 1); - for ( ; addr < end; addr += clflush_size ) -/* - * The arguments to a macro must not include preprocessor directives. Doing so - * results in undefined behavior, so we have to create some defines here in - * order to avoid it. - */ -#if defined(HAVE_AS_CLWB) -# define CLWB_ENCODING "clwb %[p]" -#elif defined(HAVE_AS_XSAVEOPT) -# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ -#else -# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ -#endif - -#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) -#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) -# define INPUT BASE_INPUT -#else -# define INPUT(addr) "a" (addr), BASE_INPUT(addr) -#endif - /* - * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush - * + prefix than a clflush + nop, and hence the prefix is added instead - * of letting the alternative framework fill the gap by appending nops. 
- */ - alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", - "data16 clflush %[p]", /* clflushopt */ - X86_FEATURE_CLFLUSHOPT, - CLWB_ENCODING, - X86_FEATURE_CLWB, /* no outputs */, - INPUT(addr)); -#undef INPUT -#undef BASE_INPUT -#undef CLWB_ENCODING - - alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, - "sfence", X86_FEATURE_CLWB); + cache_writeback(addr, size); } /* Allocate page table, return its machine address */ @@ -1530,7 +1488,7 @@ check_cleanup_domid_map(domain, pdev, iommu); printk(XENLOG_ERR "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", - pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), (uint64_t)(res >> 64), (uint64_t)res, (uint64_t)(old >> 64), (uint64_t)old); rc = -EILSEQ; @@ -1596,18 +1554,22 @@ if ( !seg && !rc ) rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); - if ( rc ) + if ( rc && !(mode & MAP_ERROR_RECOVERY) ) { - if ( !prev_dom ) - ret = domain_context_unmap_one(domain, iommu, bus, devfn, - DEVICE_DOMID(domain, pdev)); - else if ( prev_dom != domain ) /* Avoid infinite recursion. */ + if ( !prev_dom || + /* + * Unmapping here means DEV_TYPE_PCI devices with RMRRs (if such + * exist) would cause problems if such a region was actually + * accessed. + */ + (prev_dom == dom_io && !pdev) ) + ret = domain_context_unmap_one(domain, iommu, bus, devfn); + else ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, DEVICE_DOMID(prev_dom, pdev), DEVICE_PGTABLE(prev_dom, pdev), - mode & MAP_WITH_RMRR) < 0; - else - ret = 1; + (mode & MAP_WITH_RMRR) | + MAP_ERROR_RECOVERY) < 0; if ( !ret && pdev && pdev->devfn == devfn ) check_cleanup_domid_map(domain, pdev, iommu); @@ -1742,7 +1704,9 @@ * Strictly speaking if the device is the only one behind this bridge * and the only one with this (secbus,0,0) tuple, it could be allowed * to be re-assigned regardless of RMRR presence. But let's deal with - * that case only if it is actually found in the wild. + * that case only if it is actually found in the wild. Note that + * dealing with this just here would still not render the operation + * secure. 
*/ else if ( prev_present && (mode & MAP_WITH_RMRR) && domain != pdev->domain ) @@ -1808,7 +1772,7 @@ int domain_context_unmap_one( struct domain *domain, struct vtd_iommu *iommu, - uint8_t bus, uint8_t devfn, domid_t domid) + uint8_t bus, uint8_t devfn) { struct context_entry *context, *context_entries; u64 maddr; @@ -1829,18 +1793,12 @@ return 0; } + iommu_domid = context_domain_id(*context); + context_clear_present(*context); context_clear_entry(*context); iommu_sync_cache(context, sizeof(struct context_entry)); - iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying); - if ( iommu_domid == -1 ) - { - spin_unlock(&iommu->lock); - unmap_vtd_domain_page(context_entries); - return -EINVAL; - } - rc = iommu_flush_context_device(iommu, iommu_domid, PCI_BDF2(bus, devfn), DMA_CCMD_MASK_NOBIT, 0); @@ -1866,7 +1824,8 @@ unmap_vtd_domain_page(context_entries); if ( !iommu->drhd->segment && !rc ) - rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC); + rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, 0, + UNMAP_ME_PHANTOM_FUNC); if ( rc && !is_hardware_domain(domain) && domain != dom_io ) { @@ -1921,8 +1880,7 @@ printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_unmap_one(domain, iommu, bus, devfn, - DEVICE_DOMID(domain, pdev)); + ret = domain_context_unmap_one(domain, iommu, bus, devfn); if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) disable_ats_device(pdev); @@ -1932,8 +1890,7 @@ if ( iommu_debug ) printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_unmap_one(domain, iommu, bus, devfn, - DEVICE_DOMID(domain, pdev)); + ret = domain_context_unmap_one(domain, iommu, bus, devfn); if ( ret ) break; @@ -1956,12 +1913,10 @@ break; } - ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, - DEVICE_DOMID(domain, pdev)); + ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); /* PCIe to PCI/PCIx bridge */ if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) - ret = domain_context_unmap_one(domain, iommu, secbus, 0, - DEVICE_DOMID(domain, pdev)); + ret = domain_context_unmap_one(domain, iommu, secbus, 0); break; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/quirks.c xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/quirks.c --- xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/quirks.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/quirks.c 2022-07-12 15:31:49.000000000 +0000 @@ -364,7 +364,7 @@ domid, pgd_maddr, mode); else rc = domain_context_unmap_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7), domid); + PCI_DEVFN(dev, 7)); return rc; } diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/vtd.h xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/vtd.h --- xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/vtd.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/vtd.h 2022-07-12 15:31:49.000000000 +0000 @@ -29,7 +29,8 @@ #define MAP_WITH_RMRR (1u << 0) #define MAP_OWNER_DYING (1u << 1) #define MAP_SINGLE_DEVICE (1u << 2) -#define UNMAP_ME_PHANTOM_FUNC (1u << 3) +#define MAP_ERROR_RECOVERY (1u << 3) +#define UNMAP_ME_PHANTOM_FUNC (1u << 4) /* Allow for both IOAPIC and IOSAPIC. 
*/ #define IO_xAPIC_route_entry IO_APIC_route_entry diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/x86/vtd.c xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/x86/vtd.c --- xen-4.14.4+74-gd7b22226b5/xen/drivers/passthrough/vtd/x86/vtd.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/passthrough/vtd/x86/vtd.c 2022-07-12 15:31:49.000000000 +0000 @@ -47,11 +47,6 @@ unmap_domain_page(va); } -unsigned int get_cache_line_size(void) -{ - return ((cpuid_ebx(1) >> 8) & 0xff) * 8; -} - void flush_all_cache() { wbinvd(); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/drivers/vpci/msix.c xen-4.14.5+24-g87d90d511c/xen/drivers/vpci/msix.c --- xen-4.14.4+74-gd7b22226b5/xen/drivers/vpci/msix.c 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/drivers/vpci/msix.c 2022-07-12 15:31:49.000000000 +0000 @@ -258,8 +258,9 @@ if ( !pba ) { gprintk(XENLOG_WARNING, - "%pp: unable to map MSI-X PBA, report all pending\n", - msix->pdev); + "%04x:%02x:%02x.%u: unable to map MSI-X PBA, report all pending\n", + msix->pdev->seg, msix->pdev->bus, PCI_SLOT(msix->pdev->devfn), + PCI_FUNC(msix->pdev->devfn)); return X86EMUL_OKAY; } @@ -342,8 +343,10 @@ { /* Unable to map the PBA, ignore write. */ gprintk(XENLOG_WARNING, - "%pp: unable to map MSI-X PBA, write ignored\n", - msix->pdev); + "%04x:%02x:%02x.%u: unable to map MSI-X PBA, write ignored\n", + msix->pdev->seg, msix->pdev->bus, + PCI_SLOT(msix->pdev->devfn), + PCI_FUNC(msix->pdev->devfn)); return X86EMUL_OKAY; } diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/cache.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/cache.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/cache.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/cache.h 2022-07-12 15:31:49.000000000 +0000 @@ -11,4 +11,11 @@ #define __read_mostly __section(".data.read_mostly") +#ifndef __ASSEMBLY__ + +void cache_flush(const void *addr, unsigned int size); +void cache_writeback(const void *addr, unsigned int size); + +#endif + #endif diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/cpufeatures.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/cpufeatures.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/cpufeatures.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/cpufeatures.h 2022-07-12 15:31:49.000000000 +0000 @@ -33,19 +33,21 @@ XEN_CPUFEATURE(SC_RSB_PV, X86_SYNTH(18)) /* RSB overwrite needed for PV */ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM */ XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ -XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ +XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ -XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */ -XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */ +/* Bits 23,24 unused. */ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ +XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */ +XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */ /* Bug words follow the synthetic words. 
*/ #define X86_NR_BUG 1 #define X86_BUG(x) ((FSCAPINTS + X86_NR_SYNTH) * 32 + (x)) #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ +#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ /* Total number of capability words, inc synth and bug words. */ #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/domain.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/domain.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/domain.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/domain.h 2022-07-12 15:31:49.000000000 +0000 @@ -308,6 +308,8 @@ uint32_t pci_cf8; uint8_t cmos_idx; + uint8_t spec_ctrl_flags; /* See SCF_DOM_MASK */ + union { struct pv_domain pv; struct hvm_domain hvm; diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/mm.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/mm.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/mm.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/mm.h 2022-07-12 15:31:49.000000000 +0000 @@ -48,8 +48,12 @@ #define _PGT_partial PG_shift(8) #define PGT_partial PG_mask(1, 8) +/* Has this page been mapped writeable with a non-coherent memory type? */ +#define _PGT_non_coherent PG_shift(9) +#define PGT_non_coherent PG_mask(1, 9) + /* Count of uses of this frame as its current type. */ -#define PGT_count_width PG_shift(8) +#define PGT_count_width PG_shift(9) #define PGT_count_mask ((1UL<<PGT_count_width)-1) [...] #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) /* Page is not reference counted */ -#define _PGC_extra PG_shift(10) -#define PGC_extra PG_mask(1, 10) +#define _PGC_extra PG_shift(7) +#define PGC_extra PG_mask(1, 7) /* Count of references to this frame. */ -#define PGC_count_width PG_shift(10) +#define PGC_count_width PG_shift(7) #define PGC_count_mask ((1UL<<PGC_count_width)-1) [...] diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/spec_ctrl.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/spec_ctrl.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/spec_ctrl.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/spec_ctrl.h 2022-07-12 15:31:49.000000000 +0000 [...] void init_speculation_mitigations(void); +void spec_ctrl_init_domain(struct domain *d); -extern bool opt_ibpb; +extern int8_t opt_ibpb_ctxt_switch; extern bool opt_ssbd; extern int8_t opt_eager_fpu; extern int8_t opt_l1d_flush; @@ -76,7 +108,8 @@ uint32_t val = 0; /* - * Branch Target Injection: + * It is recommended in some cases to clear MSR_SPEC_CTRL when going idle, + * to avoid impacting sibling threads. * * Latch the new shadow value, then enable shadowing, then update the MSR. * There are no SMP issues here; only local processor ordering concerns. @@ -112,7 +145,7 @@ uint32_t val = info->xen_spec_ctrl; /* - * Branch Target Injection: + * Restore MSR_SPEC_CTRL on exit from idle. * * Disable shadowing before updating the MSR. There are no SMP issues * here; only local processor ordering concerns. diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/spec_ctrl_asm.h xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/spec_ctrl_asm.h --- xen-4.14.4+74-gd7b22226b5/xen/include/asm-x86/spec_ctrl_asm.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/asm-x86/spec_ctrl_asm.h 2022-07-12 15:31:49.000000000 +0000 @@ -88,6 +88,35 @@ * - SPEC_CTRL_EXIT_TO_{SVM,VMX} */ +.macro DO_SPEC_CTRL_COND_IBPB maybexen:req +/* + * Requires %rsp=regs (also cpuinfo if !maybexen) + * Requires %r14=stack_end (if maybexen), %rdx=0 + * Clobbers %rax, %rcx, %rdx + * + * Conditionally issue IBPB if SCF_entry_ibpb is active. In the maybexen + * case, we can safely look at UREGS_cs to skip taking the hit when + * interrupting Xen. 
+ */ + .if \maybexen + testb $SCF_entry_ibpb, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + jz .L\@_skip + testb $3, UREGS_cs(%rsp) + .else + testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) + .endif + jz .L\@_skip + + mov $MSR_PRED_CMD, %ecx + mov $PRED_CMD_IBPB, %eax + wrmsr + jmp .L\@_done + +.L\@_skip: + lfence +.L\@_done: +.endm + .macro DO_OVERWRITE_RSB tmp=rax /* * Requires nothing @@ -136,6 +165,19 @@ #endif .endm +.macro DO_SPEC_CTRL_COND_VERW +/* + * Requires %rsp=cpuinfo + * + * Issue a VERW for its flushing side effect, if indicated. This is a Spectre + * v1 gadget, but the IRET/VMEntry is serialising. + */ + testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) + jz .L\@_verw_skip + verw CPUINFO_verw_sel(%rsp) +.L\@_verw_skip: +.endm + .macro DO_SPEC_CTRL_ENTRY maybexen:req /* * Requires %rsp=regs (also cpuinfo if !maybexen) @@ -212,12 +254,16 @@ /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ #define SPEC_CTRL_ENTRY_FROM_PV \ + ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ + X86_FEATURE_IBPB_ENTRY_PV; \ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ X86_FEATURE_SC_MSR_PV /* Use in interrupt/exception context. May interrupt Xen or PV context. */ #define SPEC_CTRL_ENTRY_FROM_INTR \ + ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ + X86_FEATURE_IBPB_ENTRY_PV; \ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ X86_FEATURE_SC_MSR_PV @@ -231,45 +277,52 @@ #define SPEC_CTRL_EXIT_TO_PV \ ALTERNATIVE "", \ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ - ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \ - X86_FEATURE_SC_VERW_PV + DO_SPEC_CTRL_COND_VERW /* * Use in IST interrupt/exception context. May interrupt Xen or PV context. - * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume - * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has - * been reloaded. */ .macro SPEC_CTRL_ENTRY_FROM_INTR_IST /* - * Requires %rsp=regs, %r14=stack_end - * Clobbers %rax, %rcx, %rdx + * Requires %rsp=regs, %r14=stack_end, %rdx=0 + * Clobbers %rax, %rbx, %rcx, %rdx * - * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY - * maybexen=1, but with conditionals rather than alternatives. + * This is logical merge of: + * DO_SPEC_CTRL_COND_IBPB maybexen=0 + * DO_OVERWRITE_RSB + * DO_SPEC_CTRL_ENTRY maybexen=1 + * but with conditionals rather than alternatives. */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax + movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx - test $SCF_ist_rsb, %al + test $SCF_ist_ibpb, %bl + jz .L\@_skip_ibpb + + mov $MSR_PRED_CMD, %ecx + mov $PRED_CMD_IBPB, %eax + wrmsr + +.L\@_skip_ibpb: + + test $SCF_ist_rsb, %bl jz .L\@_skip_rsb - DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */ + DO_OVERWRITE_RSB /* Clobbers %rax/%rcx */ .L\@_skip_rsb: - test $SCF_ist_wrmsr, %al - jz .L\@_skip_wrmsr + test $SCF_ist_sc_msr, %bl + jz .L\@_skip_msr_spec_ctrl - xor %edx, %edx + xor %eax, %eax testb $3, UREGS_cs(%rsp) - setnz %dl - not %edx - and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + setnz %al + not %eax + and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) /* Load Xen's intended value. */ mov $MSR_SPEC_CTRL, %ecx movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax - xor %edx, %edx wrmsr /* Opencoded UNLIKELY_START() with no condition. */ @@ -281,7 +334,7 @@ * to speculate around the WRMSR. 
As a result, we need a dispatch * serialising instruction in the else clause. */ -.L\@_skip_wrmsr: +.L\@_skip_msr_spec_ctrl: lfence UNLIKELY_END(\@_serialise) .endm @@ -292,7 +345,7 @@ * Requires %rbx=stack_end * Clobbers %rax, %rcx, %rdx */ - testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) + testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) jz .L\@_skip DO_SPEC_CTRL_EXIT_TO_XEN diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/public/arch-x86/cpufeatureset.h xen-4.14.5+24-g87d90d511c/xen/include/public/arch-x86/cpufeatureset.h --- xen-4.14.4+74-gd7b22226b5/xen/include/public/arch-x86/cpufeatureset.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/public/arch-x86/cpufeatureset.h 2022-07-12 15:31:49.000000000 +0000 @@ -264,6 +264,7 @@ XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ +XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/xen/lib.h xen-4.14.5+24-g87d90d511c/xen/include/xen/lib.h --- xen-4.14.4+74-gd7b22226b5/xen/include/xen/lib.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/xen/lib.h 2022-07-12 15:31:49.000000000 +0000 @@ -82,7 +82,8 @@ /** * Given a specific name, parses a string of the form: * [no-]$NAME[=...] - * returning 0 or 1 for a recognised boolean, or -1 for an error. + * returning 0 or 1 for a recognised boolean. Returns -1 for general errors, + * and -2 for "not a boolean, but $NAME= matches". */ int parse_boolean(const char *name, const char *s, const char *e); diff -Nru xen-4.14.4+74-gd7b22226b5/xen/include/xen/livepatch_elf.h xen-4.14.5+24-g87d90d511c/xen/include/xen/livepatch_elf.h --- xen-4.14.4+74-gd7b22226b5/xen/include/xen/livepatch_elf.h 2022-04-07 07:11:53.000000000 +0000 +++ xen-4.14.5+24-g87d90d511c/xen/include/xen/livepatch_elf.h 2022-07-12 15:31:49.000000000 +0000 @@ -22,6 +22,7 @@ struct livepatch_elf_sym { const Elf_Sym *sym; const char *name; + bool ignored; }; struct livepatch_elf { @@ -48,7 +49,7 @@ static inline bool livepatch_elf_ignore_section(const Elf_Shdr *sec) { - return !(sec->sh_flags & SHF_ALLOC) || sec->sh_size == 0; + return !(sec->sh_flags & SHF_ALLOC); } #endif /* __XEN_LIVEPATCH_ELF_H__ */