Version in base suite: 4.17.3+10-g091466ba55-1~deb12u1

Base version: xen_4.17.3+10-g091466ba55-1~deb12u1
Target version: xen_4.17.5-1~deb12u1
Base file: /srv/ftp-master.debian.org/ftp/pool/main/x/xen/xen_4.17.3+10-g091466ba55-1~deb12u1.dsc
Target file: /srv/ftp-master.debian.org/policy/pool/main/x/xen/xen_4.17.5-1~deb12u1.dsc

.cirrus.yml | 2
CHANGELOG.md | 7
LICENSES/MIT-0 | 31
SUPPORT.md | 5
config/Tools.mk.in | 2
debian/changelog | 28
debian/patches/0002-Delete-configure-output.patch | 507 --------
debian/patches/0003-Display-Debian-package-version-in-hypervisor-log.patch | 2
debian/patches/0011-config-Tools.mk.in-Respect-caller-s-CONFIG_PV_SHIM.patch | 4
debian/patches/0022-give-meaningful-error-message-if-qemu-device-model-i.patch | 4
docs/misc/xen-command-line.pandoc | 64 -
m4/systemd.m4 | 17
tools/config.h.in | 3
tools/configure | 497 --------
tools/configure.ac | 2
tools/firmware/etherboot/Makefile | 2
tools/firmware/hvmloader/pci.c | 28
tools/hotplug/Linux/block-common.sh | 8
tools/include/xen-sd-notify.h | 98 +
tools/libs/guest/xg_dom_core.c | 2
tools/libs/light/libxl_console.c | 11
tools/libs/light/libxl_device.c | 72 -
tools/libs/light/libxl_dm.c | 14
tools/libs/light/libxl_x86_acpi.c | 6
tools/libs/light/libxl_xshelp.c | 13
tools/libs/store/xs.c | 64 -
tools/misc/xen-cpuid.c | 5
tools/misc/xen-ucode.c | 5
tools/ocaml/xenstored/Makefile | 3
tools/ocaml/xenstored/quota.ml | 65 -
tools/ocaml/xenstored/store.ml | 17
tools/ocaml/xenstored/systemd_stubs.c | 2
tools/tests/resource/test-resource.c | 39
tools/tests/tsx/test-tsx.c | 39
tools/tests/xenstore/test-xenstore.c | 12
tools/xcutils/lsevtchn.c | 22
tools/xenstore/Makefile | 5
tools/xenstore/xenstored_core.c | 4
tools/xentop/xentop.c | 23
tools/xl/xl_utils.c | 6
xen/Makefile | 7
xen/Rules.mk | 4
xen/arch/arm/alternative.c | 6
xen/arch/arm/include/asm/alternative.h | 12
xen/arch/arm/irq.c | 2
xen/arch/arm/setup.c | 2
xen/arch/x86/Makefile | 3
xen/arch/x86/acpi/cpufreq/cpufreq.c | 17
xen/arch/x86/acpi/cpufreq/powernow.c | 3
xen/arch/x86/acpi/power.c | 4
xen/arch/x86/alternative.c | 2
xen/arch/x86/bhb-thunk.S | 102 +
xen/arch/x86/cpu-policy.c | 109 +
xen/arch/x86/cpu/amd.c | 7
xen/arch/x86/cpu/centaur.c | 2
xen/arch/x86/cpu/common.c | 27
xen/arch/x86/cpu/cpu.h | 2
xen/arch/x86/cpu/hygon.c | 2
xen/arch/x86/cpu/intel.c | 36
xen/arch/x86/cpu/mcheck/mcaction.c | 10
xen/arch/x86/cpu/mcheck/mcaction.h | 5
xen/arch/x86/cpu/mcheck/mce.c | 76 -
xen/arch/x86/cpu/mcheck/mce.h | 74 -
xen/arch/x86/cpu/mcheck/mce_amd.c | 24
xen/arch/x86/cpu/mcheck/mce_intel.c | 20
xen/arch/x86/cpu/microcode/amd.c | 7
xen/arch/x86/cpu/microcode/core.c | 4
xen/arch/x86/cpu/microcode/intel.c | 7
xen/arch/x86/cpu/mtrr/generic.c | 26
xen/arch/x86/cpu/mtrr/main.c | 79 -
xen/arch/x86/cpu/mtrr/mtrr.h | 37
xen/arch/x86/cpu/shanghai.c | 2
xen/arch/x86/cpuid.c | 30
xen/arch/x86/domain.c | 15
xen/arch/x86/extable.c | 28
xen/arch/x86/genapic/bigsmp.c | 2
xen/arch/x86/genapic/default.c | 2
xen/arch/x86/genapic/probe.c | 2
xen/arch/x86/genapic/x2apic.c | 6
xen/arch/x86/guest/hyperv/hyperv.c | 2
xen/arch/x86/guest/hypervisor.c | 6
xen/arch/x86/guest/xen/xen.c | 2
xen/arch/x86/hpet.c | 4
xen/arch/x86/hvm/emulate.c | 7
xen/arch/x86/hvm/hvm.c | 36
xen/arch/x86/hvm/rtc.c | 1
xen/arch/x86/hvm/svm/entry.S | 26
xen/arch/x86/hvm/svm/svm.c | 4
xen/arch/x86/hvm/vmx/entry.S | 65 +
xen/arch/x86/hvm/vmx/vmcs.c | 86 +
xen/arch/x86/hvm/vmx/vmx.c | 66 +
xen/arch/x86/hvm/vpt.c | 10
xen/arch/x86/hypercall.c | 36
xen/arch/x86/include/asm/alternative.h | 59 -
xen/arch/x86/include/asm/apic.h | 5
xen/arch/x86/include/asm/asm-defns.h | 18
xen/arch/x86/include/asm/asm_defns.h | 8
xen/arch/x86/include/asm/cpufeature.h | 4
xen/arch/x86/include/asm/cpufeatures.h | 10
xen/arch/x86/include/asm/current.h | 10
xen/arch/x86/include/asm/domain.h | 2
xen/arch/x86/include/asm/hpet.h | 4
xen/arch/x86/include/asm/hvm/hvm.h | 18
xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 18
xen/arch/x86/include/asm/hvm/vmx/vmx.h | 1
xen/arch/x86/include/asm/intel-family.h | 38
xen/arch/x86/include/asm/io.h | 10
xen/arch/x86/include/asm/irq.h | 24
xen/arch/x86/include/asm/mach-generic/mach_apic.h | 2
xen/arch/x86/include/asm/mm.h | 4
xen/arch/x86/include/asm/msr-index.h | 3
xen/arch/x86/include/asm/msr.h | 7
xen/arch/x86/include/asm/nospec.h | 26
xen/arch/x86/include/asm/paging.h | 5
xen/arch/x86/include/asm/setup.h | 2
xen/arch/x86/include/asm/spec_ctrl.h | 22
xen/arch/x86/include/asm/spec_ctrl_asm.h | 219 ++-
xen/arch/x86/include/asm/uaccess.h | 3
xen/arch/x86/io_apic.c | 23
xen/arch/x86/ioport_emulate.c | 9
xen/arch/x86/irq.c | 204 ++-
xen/arch/x86/livepatch.c | 4
xen/arch/x86/mm.c | 43
xen/arch/x86/mm/hap/hap.c | 5
xen/arch/x86/mm/mm-locks.h | 37
xen/arch/x86/mm/p2m-ept.c | 41
xen/arch/x86/mm/p2m-pod.c | 17
xen/arch/x86/mm/p2m.c | 13
xen/arch/x86/mm/paging.c | 2
xen/arch/x86/mm/shadow/common.c | 2
xen/arch/x86/mm/shadow/multi.c | 17
xen/arch/x86/mm/shadow/none.c | 3
xen/arch/x86/msi.c | 4
xen/arch/x86/msr.c | 9
xen/arch/x86/nmi.c | 69 -
xen/arch/x86/platform_hypercall.c | 2
xen/arch/x86/pv/dom0_build.c | 2
xen/arch/x86/pv/emul-priv-op.c | 4
xen/arch/x86/setup.c | 6
xen/arch/x86/smp.c | 2
xen/arch/x86/spec_ctrl.c | 571 ++++++++--
xen/arch/x86/time.c | 12
xen/arch/x86/traps.c | 63 -
xen/arch/x86/tsx.c | 55
xen/arch/x86/x86_64/asm-offsets.c | 29
xen/arch/x86/x86_64/compat/entry.S | 14
xen/arch/x86/x86_64/entry.S | 169 +-
xen/arch/x86/x86_emulate/x86_emulate.c | 5
xen/arch/x86/xstate.c | 18
xen/build.mk | 2
xen/common/Kconfig | 17
xen/common/bunzip2.c | 3
xen/common/core_parking.c | 21
xen/common/cpu.c | 5
xen/common/domain.c | 24
xen/common/domctl.c | 4
xen/common/event_channel.c | 18
xen/common/grant_table.c | 6
xen/common/irq.c | 2
xen/common/livepatch.c | 172 ++-
xen/common/rwlock.c | 20
xen/common/sched/compat.c | 4
xen/common/sched/core.c | 86 -
xen/common/sched/private.h | 26
xen/common/timer.c | 8
xen/common/ubsan/ubsan.h | 2
xen/common/virtual_region.c | 77 -
xen/drivers/acpi/pmstat.c | 3
xen/drivers/cpufreq/cpufreq.c | 6
xen/drivers/cpufreq/utility.c | 6
xen/drivers/passthrough/amd/iommu_acpi.c | 11
xen/drivers/passthrough/pci.c | 5
xen/drivers/passthrough/vtd/x86/ats.c | 10
xen/drivers/passthrough/x86/iommu.c | 30
xen/include/hypercall-defs.c | 2
xen/include/public/arch-x86/cpufeatureset.h | 17
xen/include/xen/alternative-call.h | 7
xen/include/xen/cpu.h | 10
xen/include/xen/elfstructs.h | 2
xen/include/xen/event.h | 4
xen/include/xen/irq.h | 26
xen/include/xen/livepatch.h | 32
xen/include/xen/mm.h | 3
xen/include/xen/nospec.h | 15
xen/include/xen/param.h | 2
xen/include/xen/pci.h | 8
xen/include/xen/rwlock.h | 50
xen/include/xen/spinlock.h | 45
xen/include/xen/virtual_region.h | 14
xen/include/xen/xmalloc.h | 7
xen/include/xsm/dummy.h | 2
xen/include/xsm/xsm.h | 7
xen/test/livepatch/Makefile | 4
xen/test/livepatch/xen_action_hooks_norevert.c | 24
xen/tools/gen-cpuid.py | 7
xen/xsm/flask/hooks.c | 14
196 files changed, 3419 insertions(+), 2323 deletions(-)

diff -Nru xen-4.17.3+10-g091466ba55/.cirrus.yml xen-4.17.5/.cirrus.yml
--- xen-4.17.3+10-g091466ba55/.cirrus.yml 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/.cirrus.yml 2024-08-14 09:03:57.000000000 +0000
@@ -17,7 +17,7 @@
 task:
   name: 'FreeBSD 13'
   freebsd_instance:
-    image_family: freebsd-13-2
+    image_family: freebsd-13-3
   << : *FREEBSD_TEMPLATE
 
 task:
diff -Nru xen-4.17.3+10-g091466ba55/CHANGELOG.md xen-4.17.5/CHANGELOG.md
--- xen-4.17.3+10-g091466ba55/CHANGELOG.md 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/CHANGELOG.md 2024-08-14 09:03:57.000000000 +0000
@@ -4,6 +4,13 @@
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
+## [4.17.5](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.5)
+
+### Changed
+ - When building with Systemd support (./configure --enable-systemd), remove
+   libsystemd as a build dependency. Systemd Notify support is retained, now
+   using a standalone library implementation.
+
 ## [4.17.3](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.3)
 
 ### Changed
diff -Nru xen-4.17.3+10-g091466ba55/LICENSES/MIT-0 xen-4.17.5/LICENSES/MIT-0
--- xen-4.17.3+10-g091466ba55/LICENSES/MIT-0 1970-01-01 00:00:00.000000000 +0000
+++ xen-4.17.5/LICENSES/MIT-0 2024-08-14 09:03:57.000000000 +0000
@@ -0,0 +1,31 @@
+Valid-License-Identifier: MIT-0
+
+SPDX-URL: https://spdx.org/licenses/MIT-0.html
+
+Usage-Guide:
+
+  To use the MIT-0 License put the following SPDX tag/value pair into a
+  comment according to the placement guidelines in the licensing rules
+  documentation:
+    SPDX-License-Identifier: MIT-0
+
+License-Text:
+
+MIT No Attribution
+
+Copyright
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff -Nru xen-4.17.3+10-g091466ba55/SUPPORT.md xen-4.17.5/SUPPORT.md
--- xen-4.17.3+10-g091466ba55/SUPPORT.md 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/SUPPORT.md 2024-08-14 09:03:57.000000000 +0000
@@ -775,6 +775,11 @@
 
 Only systems using IOMMUs are supported.
 
+Passing through of devices sharing resources with another device is not
+security supported. Such sharing could e.g. be the same line interrupt being
+used by multiple devices, one of which is to be passed through, or two such
+devices having memory BARs within the same 4k page.
+
 Not compatible with migration, populate-on-demand, altp2m, introspection,
 memory sharing, or memory paging.
 
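[The "standalone library implementation" named in the 4.17.5 CHANGELOG entry
above is the sd_notify(3) readiness protocol spoken directly: one datagram,
e.g. "READY=1", sent to the AF_UNIX socket that systemd passes in
$NOTIFY_SOCKET. A minimal sketch of that protocol, assuming only POSIX
sockets, is shown below; notify_ready() is a hypothetical name used for
illustration, and the actual replacement is the new
tools/include/xen-sd-notify.h further down in this diff.

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

/* Send one state datagram, e.g. "READY=1", to the supervising systemd.
 * Returns 0 when not running under systemd, a negative errno on failure. */
static int notify_ready(const char *state)
{
    struct sockaddr_un sun = { .sun_family = AF_UNIX };
    const char *path = getenv("NOTIFY_SOCKET");
    socklen_t addr_len;
    int fd, rc = 0;

    if (!path)
        return 0;                 /* no supervisor: the protocol is a no-op */
    if (strlen(path) >= sizeof(sun.sun_path))
        return -E2BIG;

    strcpy(sun.sun_path, path);
    if (sun.sun_path[0] == '@')   /* Linux abstract socket namespace */
        sun.sun_path[0] = '\0';
    addr_len = offsetof(struct sockaddr_un, sun_path) + strlen(path);

    fd = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
    if (fd < 0)
        return -errno;

    /* The whole state change must fit in a single datagram. */
    if (sendto(fd, state, strlen(state), 0,
               (struct sockaddr *)&sun, addr_len) != (ssize_t)strlen(state))
        rc = errno ? -errno : -EIO;

    close(fd);
    return rc;
}

Dropping the libsystemd build dependency is viable precisely because this
protocol is small and documented as stable by systemd upstream.]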
diff -Nru xen-4.17.3+10-g091466ba55/config/Tools.mk.in xen-4.17.5/config/Tools.mk.in --- xen-4.17.3+10-g091466ba55/config/Tools.mk.in 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/config/Tools.mk.in 2024-08-14 09:03:57.000000000 +0000 @@ -56,8 +56,6 @@ CONFIG_GOLANG := @golang@ CONFIG_SYSTEMD := @systemd@ -SYSTEMD_CFLAGS := @SYSTEMD_CFLAGS@ -SYSTEMD_LIBS := @SYSTEMD_LIBS@ XEN_SYSTEMD_DIR := @SYSTEMD_DIR@ XEN_SYSTEMD_MODULES_LOAD := @SYSTEMD_MODULES_LOAD@ CONFIG_9PFS := @ninepfs@ diff -Nru xen-4.17.3+10-g091466ba55/debian/changelog xen-4.17.5/debian/changelog --- xen-4.17.3+10-g091466ba55/debian/changelog 2024-02-04 15:31:59.000000000 +0000 +++ xen-4.17.5/debian/changelog 2024-08-18 18:33:38.000000000 +0000 @@ -1,3 +1,31 @@ +xen (4.17.5-1~deb12u1) bookworm; urgency=medium + + * Update to new upstream version 4.17.5, which also contains + security fixes for the following issues: + - x86: shadow stack vs exceptions from emulation stubs + XSA-451 CVE-2023-46841 + - x86: Register File Data Sampling + XSA-452 CVE-2023-28746 + - GhostRace: Speculative Race Conditions + XSA-453 CVE-2024-2193 + - x86 HVM hypercalls may trigger Xen bug check + XSA-454 CVE-2023-46842 + - x86: Incorrect logic for BTC/SRSO mitigations + XSA-455 CVE-2024-31142 + - x86: Native Branch History Injection + XSA-456 CVE-2024-2201 + - double unlock in x86 guest IRQ handling + XSA-458 CVE-2024-31143 + - error handling in x86 IOMMU identity mapping + XSA-460 CVE-2024-31145 + - PCI device pass-through with shared resources + XSA-461 CVE-2024-31146 + * Note that the following XSA are not listed, because... + - XSA-457 has patches for the Linux kernel. + - XSA-459 is within Xapi which is not shipped by this package. + + -- Maximilian Engelhardt Sun, 18 Aug 2024 20:33:38 +0200 + xen (4.17.3+10-g091466ba55-1~deb12u1) bookworm; urgency=medium * Rebuild 4.17.3+10-g091466ba55-1 for Bookworm to address the security diff -Nru xen-4.17.3+10-g091466ba55/debian/patches/0002-Delete-configure-output.patch xen-4.17.5/debian/patches/0002-Delete-configure-output.patch --- xen-4.17.3+10-g091466ba55/debian/patches/0002-Delete-configure-output.patch 2024-02-04 15:31:59.000000000 +0000 +++ xen-4.17.5/debian/patches/0002-Delete-configure-output.patch 2024-08-18 18:33:38.000000000 +0000 @@ -10,10 +10,10 @@ Signed-off-by: Ian Jackson --- - configure | 3671 ----------------- + configure | 3671 ------------------ docs/configure | 3483 ----------------- - tools/configure | 11490 ------------------------------------------------------ - 3 files changed, 18644 deletions(-) + tools/configure | 11019 ------------------------------------------------------ + 3 files changed, 18173 deletions(-) delete mode 100755 configure delete mode 100755 docs/configure delete mode 100755 tools/configure @@ -7186,10 +7186,10 @@ - diff --git a/tools/configure b/tools/configure deleted file mode 100755 -index f1176d1..0000000 +index 402364f..0000000 --- a/tools/configure +++ /dev/null -@@ -1,11490 +0,0 @@ +@@ -1,11019 +0,0 @@ -#! /bin/sh -# Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for Xen Hypervisor Tools 4.17. @@ -7818,8 +7818,6 @@ -LIBOBJS -pvshim -ninepfs --SYSTEMD_LIBS --SYSTEMD_CFLAGS -SYSTEMD_MODULES_LOAD -SYSTEMD_DIR -systemd @@ -8054,9 +8052,7 @@ -libzstd_CFLAGS -libzstd_LIBS -LIBNL3_CFLAGS --LIBNL3_LIBS --SYSTEMD_CFLAGS --SYSTEMD_LIBS' +-LIBNL3_LIBS' - - -# Initialize some variables set by options. 
@@ -8810,10 +8806,6 @@ - LIBNL3_CFLAGS - C compiler flags for LIBNL3, overriding pkg-config - LIBNL3_LIBS linker flags for LIBNL3, overriding pkg-config -- SYSTEMD_CFLAGS -- C compiler flags for SYSTEMD, overriding pkg-config -- SYSTEMD_LIBS -- linker flags for SYSTEMD, overriding pkg-config - -Use these variables to override the choices made by `configure' or to help -it to find libraries and programs with nonstandard names/locations. @@ -11081,8 +11073,6 @@ - - - -- -- -test "x$prefix" = "xNONE" && prefix=$ac_default_prefix -test "x$exec_prefix" = "xNONE" && exec_prefix=${prefix} - @@ -16680,223 +16670,6 @@ - - - -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... " >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd-daemon" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd-daemon" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd-daemon" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd-daemon" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... " >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` -- test "x$?" 
!= "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- systemd="n" --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- systemd="n" --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- systemd="y" --fi -- --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... " >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` -- test "x$?" 
!= "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- systemd="n" --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- systemd="n" --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- systemd="y" --fi -- --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- systemd="y" --fi -- -- - if test "x$enable_systemd" != "xno"; then : - - if test "x$systemd" = "xy" ; then : @@ -16906,262 +16679,6 @@ - - systemd=y - -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... " >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd-daemon" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd-daemon" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd-daemon" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd-daemon" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... 
" >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- as_fn_error $? "Package requirements (libsystemd >= 209) were not met: -- --$SYSTEMD_PKG_ERRORS -- --Consider adjusting the PKG_CONFIG_PATH environment variable if you --installed software in a non-standard prefix. -- --Alternatively, you may set the environment variables SYSTEMD_CFLAGS --and SYSTEMD_LIBS to avoid the need to call pkg-config. --See the pkg-config man page for more details." "$LINENO" 5 --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 --$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} --as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it --is in your PATH or set the PKG_CONFIG environment variable to the full --path to pkg-config. -- --Alternatively, you may set the environment variables SYSTEMD_CFLAGS --and SYSTEMD_LIBS to avoid the need to call pkg-config. --See the pkg-config man page for more details. -- --To get pkg-config, see . --See \`config.log' for more details" "$LINENO" 5; } --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- --fi -- --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --pkg_failed=no --{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 --$as_echo_n "checking for SYSTEMD... 
" >&6; } -- --if test -n "$SYSTEMD_CFLAGS"; then -- pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi --if test -n "$SYSTEMD_LIBS"; then -- pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" -- elif test -n "$PKG_CONFIG"; then -- if test -n "$PKG_CONFIG" && \ -- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 -- ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 -- ac_status=$? -- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -- test $ac_status = 0; }; then -- pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` -- test "x$?" != "x0" && pkg_failed=yes --else -- pkg_failed=yes --fi -- else -- pkg_failed=untried --fi -- -- -- --if test $pkg_failed = yes; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- --if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then -- _pkg_short_errors_supported=yes --else -- _pkg_short_errors_supported=no --fi -- if test $_pkg_short_errors_supported = yes; then -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- else -- SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` -- fi -- # Put the nasty error message in config.log where it belongs -- echo "$SYSTEMD_PKG_ERRORS" >&5 -- -- as_fn_error $? "Package requirements (libsystemd >= 209) were not met: -- --$SYSTEMD_PKG_ERRORS -- --Consider adjusting the PKG_CONFIG_PATH environment variable if you --installed software in a non-standard prefix. -- --Alternatively, you may set the environment variables SYSTEMD_CFLAGS --and SYSTEMD_LIBS to avoid the need to call pkg-config. --See the pkg-config man page for more details." "$LINENO" 5 --elif test $pkg_failed = untried; then -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 --$as_echo "no" >&6; } -- { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 --$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} --as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it --is in your PATH or set the PKG_CONFIG environment variable to the full --path to pkg-config. -- --Alternatively, you may set the environment variables SYSTEMD_CFLAGS --and SYSTEMD_LIBS to avoid the need to call pkg-config. --See the pkg-config man page for more details. -- --To get pkg-config, see . 
--See \`config.log' for more details" "$LINENO" 5; } --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- --fi -- --else -- SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS -- SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS -- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 --$as_echo "yes" >&6; } -- --fi -- -- -- - if test "x$SYSTEMD_DIR" = x; then : - - SYSTEMD_DIR="\$(prefix)/lib/systemd/system/" @@ -17374,6 +16891,18 @@ -fi - - +-for ac_func in pipe2 +-do : +- ac_fn_c_check_func "$LINENO" "pipe2" "ac_cv_func_pipe2" +-if test "x$ac_cv_func_pipe2" = xyes; then : +- cat >>confdefs.h <<_ACEOF +-#define HAVE_PIPE2 1 +-_ACEOF +- +-fi +-done +- +- -cat >confcache <<\_ACEOF -# This file is a shell script that caches the results of configure -# tests run on this system so they can be shared between configure diff -Nru xen-4.17.3+10-g091466ba55/debian/patches/0003-Display-Debian-package-version-in-hypervisor-log.patch xen-4.17.5/debian/patches/0003-Display-Debian-package-version-in-hypervisor-log.patch --- xen-4.17.3+10-g091466ba55/debian/patches/0003-Display-Debian-package-version-in-hypervisor-log.patch 2024-02-04 15:31:59.000000000 +0000 +++ xen-4.17.5/debian/patches/0003-Display-Debian-package-version-in-hypervisor-log.patch 2024-08-18 18:33:38.000000000 +0000 @@ -22,7 +22,7 @@ 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/xen/build.mk b/xen/build.mk -index 9ecb104..f5d9687 100644 +index b489f77..97e6ad7 100644 --- a/xen/build.mk +++ b/xen/build.mk @@ -17,7 +17,6 @@ targets += .banner diff -Nru xen-4.17.3+10-g091466ba55/debian/patches/0011-config-Tools.mk.in-Respect-caller-s-CONFIG_PV_SHIM.patch xen-4.17.5/debian/patches/0011-config-Tools.mk.in-Respect-caller-s-CONFIG_PV_SHIM.patch --- xen-4.17.3+10-g091466ba55/debian/patches/0011-config-Tools.mk.in-Respect-caller-s-CONFIG_PV_SHIM.patch 2024-02-04 15:31:59.000000000 +0000 +++ xen-4.17.5/debian/patches/0011-config-Tools.mk.in-Respect-caller-s-CONFIG_PV_SHIM.patch 2024-08-18 18:33:38.000000000 +0000 @@ -15,10 +15,10 @@ 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/Tools.mk.in b/config/Tools.mk.in -index 204c79c..605d4ac 100644 +index 28f07ce..ea2f557 100644 --- a/config/Tools.mk.in +++ b/config/Tools.mk.in -@@ -75,4 +75,4 @@ ARGP_LDFLAGS := @argp_ldflags@ +@@ -73,4 +73,4 @@ ARGP_LDFLAGS := @argp_ldflags@ FILE_OFFSET_BITS := @FILE_OFFSET_BITS@ diff -Nru xen-4.17.3+10-g091466ba55/debian/patches/0022-give-meaningful-error-message-if-qemu-device-model-i.patch xen-4.17.5/debian/patches/0022-give-meaningful-error-message-if-qemu-device-model-i.patch --- xen-4.17.3+10-g091466ba55/debian/patches/0022-give-meaningful-error-message-if-qemu-device-model-i.patch 2024-02-04 15:31:59.000000000 +0000 +++ xen-4.17.5/debian/patches/0022-give-meaningful-error-message-if-qemu-device-model-i.patch 2024-08-18 18:33:38.000000000 +0000 @@ -32,10 +32,10 @@ if (b_info->blkdev_start == NULL) diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index 14b5931..bb2736e 100644 +index 29b43ed..eca91ca 100644 --- a/tools/libs/light/libxl_dm.c +++ b/tools/libs/light/libxl_dm.c -@@ -2897,6 +2897,9 @@ void libxl__spawn_local_dm(libxl__egc *egc, libxl__dm_spawn_state *dmss) +@@ -2907,6 +2907,9 @@ void libxl__spawn_local_dm(libxl__egc *egc, libxl__dm_spawn_state *dmss) } if (access(dm, X_OK) < 0) { LOGED(ERROR, domid, "device model %s is not executable", dm); diff -Nru 
xen-4.17.3+10-g091466ba55/docs/misc/xen-command-line.pandoc xen-4.17.5/docs/misc/xen-command-line.pandoc --- xen-4.17.3+10-g091466ba55/docs/misc/xen-command-line.pandoc 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/docs/misc/xen-command-line.pandoc 2024-08-14 09:03:57.000000000 +0000 @@ -1141,7 +1141,8 @@ is for dom0. Changing the setting for domU has no impact on dom0 and vice versa. For example to change dom0 without changing domU, use `extra_guest_irqs=,512`. The default value for Dom0 and an eventual separate -hardware domain is architecture dependent. +hardware domain is architecture dependent. The upper limit for both values on +x86 is such that the resulting total number of IRQs can't be higher than 32768. Note that specifying zero as domU value means zero, while for dom0 it means to use the default. @@ -2324,10 +2325,12 @@ ### spec-ctrl (x86) > `= List of [ , xen=, {pv,hvm}=, -> {msr-sc,rsb,md-clear,ibpb-entry}=|{pv,hvm}=, -> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, +> {msr-sc,rsb,verw,{ibpb,bhb}-entry}=|{pv,hvm}=, +> bti-thunk=retpoline|lfence|jmp,bhb-seq=short|tsx|long, +> {ibrs,ibpb,ssbd,psfd, > eager-fpu,l1d-flush,branch-harden,srb-lock, -> unpriv-mmio,gds-mit,div-scrub}= ]` +> unpriv-mmio,gds-mit,div-scrub,lock-harden, +> bhi-dis-s}= ]` Controls for speculative execution sidechannel mitigations. By default, Xen will pick the most appropriate mitigations based on compiled in support, @@ -2349,10 +2352,10 @@ Use of a positive boolean value for either of these options is invalid. -The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options -offer fine grained control over the primitives by Xen. These impact Xen's -ability to protect itself, and/or Xen's ability to virtualise support for -guests to use. +The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=`, `ibpb-entry=` and `bhb-entry=` +options offer fine grained control over the primitives by Xen. These impact +Xen's ability to protect itself, and/or Xen's ability to virtualise support +for guests to use. * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests respectively. @@ -2366,23 +2369,36 @@ guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. * `rsb=` offers control over whether to overwrite the Return Stack Buffer / Return Address Stack on entry to Xen and on idle. -* `md-clear=` offers control over whether to use VERW to flush - microarchitectural buffers on idle and exit from Xen. *Note: For - compatibility with development versions of this fix, `mds=` is also accepted - on Xen 4.12 and earlier as an alias. Consult vendor documentation in - preference to here.* +* `verw=` offers control over whether to use VERW for its scrubbing side + effects at appropriate privilege transitions. The exact side effects are + microarchitecture and microcode specific. *Note: `md-clear=` is accepted as + a deprecated alias. For compatibility with development versions of XSA-297, + `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor + documentation in preference to here.* * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction Barrier) is used on entry to Xen. This is used by default on hardware vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative Return Stack Overflow if appropriate microcode has been loaded, but for performance reasons dom0 is unprotected by default. If it is necessary to protect dom0 too, boot with `spec-ctrl=ibpb-entry`. 
- -If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to -select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` -locations. The default thunk is `retpoline` (generally preferred), with the -alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and -`lfence` (an `lfence; jmp *%reg` gadget). +* `bhb-entry=` offers control over whether BHB-clearing (Branch History + Buffer) sequences are used on entry to Xen. This is used by default on + hardware vulnerable to Branch History Injection, when the BHI_DIS_S control + is not available (see `bhi-dis-s`). The choice of scrubbing sequence can be + selected using the `bhb-seq=` option. If it is necessary to protect dom0 + too, boot with `spec-ctrl=bhb-entry`. + +If Xen was compiled with `CONFIG_INDIRECT_THUNK` support, `bti-thunk=` can be +used to select which of the thunks gets patched into the +`__x86_indirect_thunk_%reg` locations. The default thunk is `retpoline` +(generally preferred), with the alternatives being `jmp` (a `jmp *%reg` gadget, +minimal overhead), and `lfence` (an `lfence; jmp *%reg` gadget). + +On all hardware, `bhb-seq=` can be used to select which of the BHB-clearing +sequences gets used. This interacts with the `bhb-entry=` and `bhi-dis-s=` +options in order to mitigate Branch History Injection on affected hardware. +The default sequence is `short`, with `tsx` as an alternative available +capable hardware, and `long` that can be opted in to. On hardware supporting IBRS (Indirect Branch Restricted Speculation), the `ibrs=` option can be used to force or prevent Xen using the feature itself. @@ -2407,6 +2423,11 @@ default, Xen will not use PSFD. PSFD is implied by SSBD, and SSBD is off by default. +On hardware supporting BHI_DIS_S (Branch History Injection Disable +Supervisor), the `bhi-dis-s=` option can be used to force or prevent Xen using +the feature itself. By default Xen will use BHI_DIS_S on hardware susceptible +to Branch History Injection. + On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` option can be used to force (the default) or prevent Xen from issuing branch prediction barriers on vcpu context switches. @@ -2453,6 +2474,11 @@ from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate DIV-leakage on hardware believed to be vulnerable. +If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=` +boolean can be used to force or prevent Xen from using speculation barriers to +protect lock critical regions. This mitigation won't be engaged by default, +and needs to be explicitly enabled on the command line. + ### sync_console > `= ` diff -Nru xen-4.17.3+10-g091466ba55/m4/systemd.m4 xen-4.17.5/m4/systemd.m4 --- xen-4.17.3+10-g091466ba55/m4/systemd.m4 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/m4/systemd.m4 2024-08-14 09:03:57.000000000 +0000 @@ -41,15 +41,6 @@ ]) AC_DEFUN([AX_CHECK_SYSTEMD_LIBS], [ - PKG_CHECK_MODULES([SYSTEMD], [libsystemd-daemon],, - [PKG_CHECK_MODULES([SYSTEMD], [libsystemd >= 209])] - ) - dnl pkg-config older than 0.24 does not set these for - dnl PKG_CHECK_MODULES() worth also noting is that as of version 208 - dnl of systemd pkg-config --cflags currently yields no extra flags yet. 
- AC_SUBST([SYSTEMD_CFLAGS]) - AC_SUBST([SYSTEMD_LIBS]) - AS_IF([test "x$SYSTEMD_DIR" = x], [ dnl In order to use the line below we need to fix upstream systemd dnl to properly ${prefix} for child variables in @@ -95,13 +86,6 @@ ],[systemd=n]) ]) -AC_DEFUN([AX_CHECK_SYSTEMD_ENABLE_AVAILABLE], [ - PKG_CHECK_MODULES([SYSTEMD], [libsystemd-daemon], [systemd="y"],[ - PKG_CHECK_MODULES([SYSTEMD], [libsystemd >= 209], - [systemd="y"],[systemd="n"]) - ]) -]) - dnl Enables systemd by default and requires a --disable-systemd option flag dnl to configure if you want to disable. AC_DEFUN([AX_ENABLE_SYSTEMD], [ @@ -121,6 +105,5 @@ dnl disable with --disable-systemd AC_DEFUN([AX_AVAILABLE_SYSTEMD], [ AX_ALLOW_SYSTEMD_OPTS() - AX_CHECK_SYSTEMD_ENABLE_AVAILABLE() AX_CHECK_SYSTEMD() ]) diff -Nru xen-4.17.3+10-g091466ba55/tools/config.h.in xen-4.17.5/tools/config.h.in --- xen-4.17.3+10-g091466ba55/tools/config.h.in 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/config.h.in 2024-08-14 09:03:57.000000000 +0000 @@ -39,6 +39,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Define to 1 if you have the `pipe2' function. */ +#undef HAVE_PIPE2 + /* Qemu traditional enabled */ #undef HAVE_QEMU_TRADITIONAL diff -Nru xen-4.17.3+10-g091466ba55/tools/configure xen-4.17.5/tools/configure --- xen-4.17.3+10-g091466ba55/tools/configure 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/configure 2024-08-14 09:03:57.000000000 +0000 @@ -626,8 +626,6 @@ LIBOBJS pvshim ninepfs -SYSTEMD_LIBS -SYSTEMD_CFLAGS SYSTEMD_MODULES_LOAD SYSTEMD_DIR systemd @@ -862,9 +860,7 @@ libzstd_CFLAGS libzstd_LIBS LIBNL3_CFLAGS -LIBNL3_LIBS -SYSTEMD_CFLAGS -SYSTEMD_LIBS' +LIBNL3_LIBS' # Initialize some variables set by options. @@ -1618,10 +1614,6 @@ LIBNL3_CFLAGS C compiler flags for LIBNL3, overriding pkg-config LIBNL3_LIBS linker flags for LIBNL3, overriding pkg-config - SYSTEMD_CFLAGS - C compiler flags for SYSTEMD, overriding pkg-config - SYSTEMD_LIBS - linker flags for SYSTEMD, overriding pkg-config Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. @@ -3889,8 +3881,6 @@ - - test "x$prefix" = "xNONE" && prefix=$ac_default_prefix test "x$exec_prefix" = "xNONE" && exec_prefix=${prefix} @@ -9488,223 +9478,6 @@ - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... " >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd-daemon" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd-daemon" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd-daemon" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd-daemon" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... " >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - systemd="n" -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - systemd="n" -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - systemd="y" -fi - -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... 
" >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - systemd="n" -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - systemd="n" -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - systemd="y" -fi - -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - systemd="y" -fi - - if test "x$enable_systemd" != "xno"; then : if test "x$systemd" = "xy" ; then : @@ -9714,262 +9487,6 @@ systemd=y - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... " >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd-daemon" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd-daemon\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd-daemon") 2>&5 - ac_status=$? 
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd-daemon" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd-daemon" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd-daemon" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... " >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - as_fn_error $? "Package requirements (libsystemd >= 209) were not met: - -$SYSTEMD_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -Alternatively, you may set the environment variables SYSTEMD_CFLAGS -and SYSTEMD_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details." "$LINENO" 5 -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "The pkg-config script could not be found or is too old. 
Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -Alternatively, you may set the environment variables SYSTEMD_CFLAGS -and SYSTEMD_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details. - -To get pkg-config, see . -See \`config.log' for more details" "$LINENO" 5; } -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -fi - -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SYSTEMD" >&5 -$as_echo_n "checking for SYSTEMD... " >&6; } - -if test -n "$SYSTEMD_CFLAGS"; then - pkg_cv_SYSTEMD_CFLAGS="$SYSTEMD_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_CFLAGS=`$PKG_CONFIG --cflags "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$SYSTEMD_LIBS"; then - pkg_cv_SYSTEMD_LIBS="$SYSTEMD_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libsystemd >= 209\""; } >&5 - ($PKG_CONFIG --exists --print-errors "libsystemd >= 209") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_SYSTEMD_LIBS=`$PKG_CONFIG --libs "libsystemd >= 209" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - else - SYSTEMD_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libsystemd >= 209" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$SYSTEMD_PKG_ERRORS" >&5 - - as_fn_error $? "Package requirements (libsystemd >= 209) were not met: - -$SYSTEMD_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -Alternatively, you may set the environment variables SYSTEMD_CFLAGS -and SYSTEMD_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details." "$LINENO" 5 -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -Alternatively, you may set the environment variables SYSTEMD_CFLAGS -and SYSTEMD_LIBS to avoid the need to call pkg-config. 
-See the pkg-config man page for more details. - -To get pkg-config, see . -See \`config.log' for more details" "$LINENO" 5; } -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -fi - -else - SYSTEMD_CFLAGS=$pkg_cv_SYSTEMD_CFLAGS - SYSTEMD_LIBS=$pkg_cv_SYSTEMD_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -fi - - - if test "x$SYSTEMD_DIR" = x; then : SYSTEMD_DIR="\$(prefix)/lib/systemd/system/" @@ -10182,6 +9699,18 @@ fi +for ac_func in pipe2 +do : + ac_fn_c_check_func "$LINENO" "pipe2" "ac_cv_func_pipe2" +if test "x$ac_cv_func_pipe2" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_PIPE2 1 +_ACEOF + +fi +done + + cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure diff -Nru xen-4.17.3+10-g091466ba55/tools/configure.ac xen-4.17.5/tools/configure.ac --- xen-4.17.3+10-g091466ba55/tools/configure.ac 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/configure.ac 2024-08-14 09:03:57.000000000 +0000 @@ -518,4 +518,6 @@ AX_FIND_HEADER([INCLUDE_ENDIAN_H], [endian.h sys/endian.h]) +AC_CHECK_FUNCS([pipe2]) + AC_OUTPUT() diff -Nru xen-4.17.3+10-g091466ba55/tools/firmware/etherboot/Makefile xen-4.17.5/tools/firmware/etherboot/Makefile --- xen-4.17.3+10-g091466ba55/tools/firmware/etherboot/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/firmware/etherboot/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -11,7 +11,7 @@ endif # put an updated tar.gz on xenbits after changes to this variable -IPXE_GIT_TAG := 3c040ad387099483102708bb1839110bc788cefb +IPXE_GIT_TAG := 1d1cf74a5e58811822bee4b3da3cff7282fcdfca IPXE_TARBALL_URL ?= $(XEN_EXTFILES_URL)/ipxe-git-$(IPXE_GIT_TAG).tar.gz diff -Nru xen-4.17.3+10-g091466ba55/tools/firmware/hvmloader/pci.c xen-4.17.5/tools/firmware/hvmloader/pci.c --- xen-4.17.3+10-g091466ba55/tools/firmware/hvmloader/pci.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/firmware/hvmloader/pci.c 2024-08-14 09:03:57.000000000 +0000 @@ -33,6 +33,13 @@ const uint32_t pci_mem_end = RESERVED_MEMBASE; uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0; +/* + * BARs larger than this value are put in 64-bit space unconditionally. That + * is, such BARs also don't play into the determination of how big the lowmem + * MMIO hole needs to be. + */ +#define BAR_RELOC_THRESH GB(1) + enum virtual_vga virtual_vga = VGA_none; unsigned long igd_opregion_pgbase = 0; @@ -286,9 +293,11 @@ bars[i].bar_reg = bar_reg; bars[i].bar_sz = bar_sz; - if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == - PCI_BASE_ADDRESS_SPACE_MEMORY) || - (bar_reg == PCI_ROM_ADDRESS) ) + if ( is_64bar && bar_sz > BAR_RELOC_THRESH ) + bar64_relocate = 1; + else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_MEMORY) || + (bar_reg == PCI_ROM_ADDRESS) ) mmio_total += bar_sz; nr_bars++; @@ -367,7 +376,7 @@ pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT; } - if ( mmio_total > (pci_mem_end - pci_mem_start) ) + if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate ) { printf("Low MMIO hole not large enough for all devices," " relocating some BARs to 64-bit\n"); @@ -430,7 +439,8 @@ /* * Relocate to high memory if the total amount of MMIO needed - * is more than the low MMIO available. Because devices are + * is more than the low MMIO available or BARs bigger than + * BAR_RELOC_THRESH are present. 
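+ * (BARs above BAR_RELOC_THRESH are given 64-bit space outright and are
+ * never added to mmio_total, so they cannot inflate the low hole.)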
Because devices are * processed in order of bar_sz, this will preferentially * relocate larger devices to high memory first. * @@ -446,8 +456,9 @@ * the code here assumes it to be.) * Should either of those two conditions change, this code will break. */ - using_64bar = bars[i].is_64bar && bar64_relocate - && (mmio_total > (mem_resource.max - mem_resource.base)); + using_64bar = bars[i].is_64bar && bar64_relocate && + (mmio_total > (mem_resource.max - mem_resource.base) || + bar_sz > BAR_RELOC_THRESH); bar_data = pci_readl(devfn, bar_reg); if ( (bar_data & PCI_BASE_ADDRESS_SPACE) == @@ -467,7 +478,8 @@ resource = &mem_resource; bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK; } - mmio_total -= bar_sz; + if ( bar_sz <= BAR_RELOC_THRESH ) + mmio_total -= bar_sz; } else { diff -Nru xen-4.17.3+10-g091466ba55/tools/hotplug/Linux/block-common.sh xen-4.17.5/tools/hotplug/Linux/block-common.sh --- xen-4.17.3+10-g091466ba55/tools/hotplug/Linux/block-common.sh 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/hotplug/Linux/block-common.sh 2024-08-14 09:03:57.000000000 +0000 @@ -112,14 +112,12 @@ "$FRONTEND_UUID") local target=$(xenstore_read_default "/local/domain/$FRONTEND_ID/target" \ "-1") - local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "-1") + local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "No Target") local otarget=$(xenstore_read_default "/local/domain/$otherdom/target" \ "-1") local otvm=$(xenstore_read_default "/local/domain/$otarget/vm" \ - "-1") - otvm=${otvm%-1} - othervm=${othervm%-1} - targetvm=${targetvm%-1} + "No Other Target") + local frontend_uuid=${FRONTEND_UUID%-1} [ "$frontend_uuid" = "$othervm" -o "$targetvm" = "$othervm" -o \ diff -Nru xen-4.17.3+10-g091466ba55/tools/include/xen-sd-notify.h xen-4.17.5/tools/include/xen-sd-notify.h --- xen-4.17.3+10-g091466ba55/tools/include/xen-sd-notify.h 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.17.5/tools/include/xen-sd-notify.h 2024-08-14 09:03:57.000000000 +0000 @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT-0 */ + +/* + * Implement the systemd notify protocol without external dependencies. + * Supports both readiness notification on startup and on reloading, + * according to the protocol defined at: + * https://www.freedesktop.org/software/systemd/man/latest/sd_notify.html + * This protocol is guaranteed to be stable as per: + * https://systemd.io/PORTABILITY_AND_STABILITY/ + * + * Differences from the upstream copy: + * - Rename/rework as a drop-in replacement for systemd/sd-daemon.h + * - Only take the subset Xen cares about + * - Respect -Wdeclaration-after-statement + */ + +#ifndef XEN_SD_NOTIFY +#define XEN_SD_NOTIFY + +#include +#include +#include +#include +#include +#include + +static inline void xen_sd_closep(int *fd) { + if (!fd || *fd < 0) + return; + + close(*fd); + *fd = -1; +} + +static inline int xen_sd_notify(const char *message) { + union sockaddr_union { + struct sockaddr sa; + struct sockaddr_un sun; + } socket_addr = { + .sun.sun_family = AF_UNIX, + }; + size_t path_length, message_length; + ssize_t written; + const char *socket_path; + int __attribute__((cleanup(xen_sd_closep))) fd = -1; + + /* Verify the argument first */ + if (!message) + return -EINVAL; + + message_length = strlen(message); + if (message_length == 0) + return -EINVAL; + + /* If the variable is not set, the protocol is a noop */ + socket_path = getenv("NOTIFY_SOCKET"); + if (!socket_path) + return 0; /* Not set? 
Nothing to do */ + + /* Only AF_UNIX is supported, with path or abstract sockets */ + if (socket_path[0] != '/' && socket_path[0] != '@') + return -EAFNOSUPPORT; + + path_length = strlen(socket_path); + /* Ensure there is room for NUL byte */ + if (path_length >= sizeof(socket_addr.sun.sun_path)) + return -E2BIG; + + memcpy(socket_addr.sun.sun_path, socket_path, path_length); + + /* Support for abstract socket */ + if (socket_addr.sun.sun_path[0] == '@') + socket_addr.sun.sun_path[0] = 0; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + if (connect(fd, &socket_addr.sa, offsetof(struct sockaddr_un, sun_path) + path_length) != 0) + return -errno; + + written = write(fd, message, message_length); + if (written != (ssize_t) message_length) + return written < 0 ? -errno : -EPROTO; + + return 1; /* Notified! */ +} + +static inline int sd_notify(int unset_environment, const char *message) { + int r = xen_sd_notify(message); + + if (unset_environment) + unsetenv("NOTIFY_SOCKET"); + + return r; +} + +#endif /* XEN_SD_NOTIFY */ diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/guest/xg_dom_core.c xen-4.17.5/tools/libs/guest/xg_dom_core.c --- xen-4.17.3+10-g091466ba55/tools/libs/guest/xg_dom_core.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/guest/xg_dom_core.c 2024-08-14 09:03:57.000000000 +0000 @@ -601,7 +601,7 @@ memset(ptr, 0, pages * page_size); seg->vstart = start; - seg->vend = dom->virt_alloc_end; + seg->vend = start + size; DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " -> 0x%" PRIx64 " (pfn 0x%" PRIpfn " + 0x%" PRIpfn " pages)", diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_console.c xen-4.17.5/tools/libs/light/libxl_console.c --- xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_console.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/light/libxl_console.c 2024-08-14 09:03:57.000000000 +0000 @@ -337,11 +337,10 @@ flexarray_append(front, "protocol"); flexarray_append(front, LIBXL_XENCONSOLE_PROTOCOL); } - libxl__device_generic_add(gc, XBT_NULL, device, - libxl__xs_kvs_of_flexarray(gc, back), - libxl__xs_kvs_of_flexarray(gc, front), - libxl__xs_kvs_of_flexarray(gc, ro_front)); - rc = 0; + rc = libxl__device_generic_add(gc, XBT_NULL, device, + libxl__xs_kvs_of_flexarray(gc, back), + libxl__xs_kvs_of_flexarray(gc, front), + libxl__xs_kvs_of_flexarray(gc, ro_front)); out: return rc; } @@ -628,6 +627,8 @@ */ if (!val) val = "/NO-SUCH-PATH"; channelinfo->u.pty.path = strdup(val); + if (channelinfo->u.pty.path == NULL) + abort(); break; default: break; diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_device.c xen-4.17.5/tools/libs/light/libxl_device.c --- xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_device.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/light/libxl_device.c 2024-08-14 09:03:57.000000000 +0000 @@ -177,8 +177,13 @@ ro_frontend_perms[1].perms = backend_perms[1].perms = XS_PERM_READ; retry_transaction: - if (create_transaction) + if (create_transaction) { t = xs_transaction_start(ctx->xsh); + if (t == XBT_NULL) { + LOGED(ERROR, device->domid, "xs_transaction_start failed"); + return ERROR_FAIL; + } + } /* FIXME: read frontend_path and check state before removing stuff */ @@ -195,42 +200,55 @@ if (rc) goto out; } - /* xxx much of this function lacks error checks! 
*/ - if (fents || ro_fents) { - xs_rm(ctx->xsh, t, frontend_path); - xs_mkdir(ctx->xsh, t, frontend_path); + if (!xs_rm(ctx->xsh, t, frontend_path) && errno != ENOENT) + goto out; + if (!xs_mkdir(ctx->xsh, t, frontend_path)) + goto out; /* Console 0 is a special case. It doesn't use the regular PV * state machine but also the frontend directory has * historically contained other information, such as the * vnc-port, which we don't want the guest fiddling with. */ if ((device->kind == LIBXL__DEVICE_KIND_CONSOLE && device->devid == 0) || - (device->kind == LIBXL__DEVICE_KIND_VUART)) - xs_set_permissions(ctx->xsh, t, frontend_path, - ro_frontend_perms, ARRAY_SIZE(ro_frontend_perms)); - else - xs_set_permissions(ctx->xsh, t, frontend_path, - frontend_perms, ARRAY_SIZE(frontend_perms)); - xs_write(ctx->xsh, t, GCSPRINTF("%s/backend", frontend_path), - backend_path, strlen(backend_path)); - if (fents) - libxl__xs_writev_perms(gc, t, frontend_path, fents, - frontend_perms, ARRAY_SIZE(frontend_perms)); - if (ro_fents) - libxl__xs_writev_perms(gc, t, frontend_path, ro_fents, - ro_frontend_perms, ARRAY_SIZE(ro_frontend_perms)); + (device->kind == LIBXL__DEVICE_KIND_VUART)) { + if (!xs_set_permissions(ctx->xsh, t, frontend_path, + ro_frontend_perms, ARRAY_SIZE(ro_frontend_perms))) + goto out; + } else { + if (!xs_set_permissions(ctx->xsh, t, frontend_path, + frontend_perms, ARRAY_SIZE(frontend_perms))) + goto out; + } + if (!xs_write(ctx->xsh, t, GCSPRINTF("%s/backend", frontend_path), + backend_path, strlen(backend_path))) + goto out; + if (fents) { + rc = libxl__xs_writev_perms(gc, t, frontend_path, fents, + frontend_perms, ARRAY_SIZE(frontend_perms)); + if (rc) goto out; + } + if (ro_fents) { + rc = libxl__xs_writev_perms(gc, t, frontend_path, ro_fents, + ro_frontend_perms, ARRAY_SIZE(ro_frontend_perms)); + if (rc) goto out; + } } if (bents) { if (!libxl_only) { - xs_rm(ctx->xsh, t, backend_path); - xs_mkdir(ctx->xsh, t, backend_path); - xs_set_permissions(ctx->xsh, t, backend_path, backend_perms, - ARRAY_SIZE(backend_perms)); - xs_write(ctx->xsh, t, GCSPRINTF("%s/frontend", backend_path), - frontend_path, strlen(frontend_path)); - libxl__xs_writev(gc, t, backend_path, bents); + if (!xs_rm(ctx->xsh, t, backend_path) && errno != ENOENT) + goto out; + if (!xs_mkdir(ctx->xsh, t, backend_path)) + goto out; + if (!xs_set_permissions(ctx->xsh, t, backend_path, backend_perms, + ARRAY_SIZE(backend_perms))) + goto out; + if (!xs_write(ctx->xsh, t, GCSPRINTF("%s/frontend", backend_path), + frontend_path, strlen(frontend_path))) + goto out; + rc = libxl__xs_writev(gc, t, backend_path, bents); + if (rc) goto out; } /* @@ -276,7 +294,7 @@ out: if (create_transaction && t) libxl__xs_transaction_abort(gc, &t); - return rc; + return rc != 0 ? rc : ERROR_FAIL; } typedef struct { diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_dm.c xen-4.17.5/tools/libs/light/libxl_dm.c --- xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_dm.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/light/libxl_dm.c 2024-08-14 09:03:57.000000000 +0000 @@ -2432,6 +2432,16 @@ "%s", libxl_bios_type_to_string(guest_config->b_info.u.hvm.bios)); } + /* Disable relocating memory to make the MMIO hole larger + * unless we're running qemu-traditional and vNUMA is not + * configured. 
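+ * (hvmloader reads this xenstore key; when it is 0 it leaves guest RAM
+ * alone and relocates BARs into 64-bit space instead.)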
*/ + libxl__xs_printf(gc, XBT_NULL, + libxl__sprintf(gc, "%s/hvmloader/allow-memory-relocate", + libxl__xs_get_dompath(gc, guest_domid)), + "%d", + guest_config->b_info.device_model_version + == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL && + !libxl__vnuma_configured(&guest_config->b_info)); ret = xc_domain_set_target(ctx->xch, dm_domid, guest_domid); if (ret<0) { LOGED(ERROR, guest_domid, "setting target domain %d -> %d", @@ -3162,8 +3172,8 @@ /* Check if spawn failed */ if (rc) goto out; - - if (d_config->b_info.device_model_version + /* d_config is NULL for xl devd/libxl__spawn_qemu_xenpv_backend(). */ + if (d_config && d_config->b_info.device_model_version == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN) { rc = libxl__ev_time_register_rel(ao, &dmss->timeout, devise_model_postconfig_timeout, diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_x86_acpi.c xen-4.17.5/tools/libs/light/libxl_x86_acpi.c --- xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_x86_acpi.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/light/libxl_x86_acpi.c 2024-08-14 09:03:57.000000000 +0000 @@ -89,7 +89,7 @@ uint32_t domid = dom->guest_domid; xc_dominfo_t info; struct hvm_info_table *hvminfo; - int i, r, rc; + int r, rc; config->dsdt_anycpu = config->dsdt_15cpu = dsdt_pvh; config->dsdt_anycpu_len = config->dsdt_15cpu_len = dsdt_pvh_len; @@ -138,8 +138,8 @@ hvminfo->nr_vcpus = info.max_vcpu_id + 1; } - for (i = 0; i < hvminfo->nr_vcpus; i++) - hvminfo->vcpu_online[i / 8] |= 1 << (i & 7); + memcpy(hvminfo->vcpu_online, b_info->avail_vcpus.map, + b_info->avail_vcpus.size); config->hvminfo = hvminfo; diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_xshelp.c xen-4.17.5/tools/libs/light/libxl_xshelp.c --- xen-4.17.3+10-g091466ba55/tools/libs/light/libxl_xshelp.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/light/libxl_xshelp.c 2024-08-14 09:03:57.000000000 +0000 @@ -60,10 +60,15 @@ for (i = 0; kvs[i] != NULL; i += 2) { path = GCSPRINTF("%s/%s", dir, kvs[i]); if (path && kvs[i + 1]) { - int length = strlen(kvs[i + 1]); - xs_write(ctx->xsh, t, path, kvs[i + 1], length); - if (perms) - xs_set_permissions(ctx->xsh, t, path, perms, num_perms); + size_t length = strlen(kvs[i + 1]); + if (length > UINT_MAX) + return ERROR_FAIL; + if (!xs_write(ctx->xsh, t, path, kvs[i + 1], length)) + return ERROR_FAIL; + if (perms) { + if (!xs_set_permissions(ctx->xsh, t, path, perms, num_perms)) + return ERROR_FAIL; + } } } return 0; diff -Nru xen-4.17.3+10-g091466ba55/tools/libs/store/xs.c xen-4.17.5/tools/libs/store/xs.c --- xen-4.17.3+10-g091466ba55/tools/libs/store/xs.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/libs/store/xs.c 2024-08-14 09:03:57.000000000 +0000 @@ -40,6 +40,14 @@ #include +#ifndef O_CLOEXEC +#define O_CLOEXEC 0 +#endif + +#ifndef SOCK_CLOEXEC +#define SOCK_CLOEXEC 0 +#endif + struct xs_stored_msg { struct list_head list; struct xsd_sockmsg hdr; @@ -172,13 +180,37 @@ return true; } +static bool set_cloexec(int fd) +{ + int flags = fcntl(fd, F_GETFD); + + if (flags < 0) + return false; + + return fcntl(fd, F_SETFD, flags | FD_CLOEXEC) >= 0; +} + +static int pipe_cloexec(int fds[2]) +{ +#if HAVE_PIPE2 + return pipe2(fds, O_CLOEXEC); +#else + if (pipe(fds) < 0) + return -1; + /* Best effort to set CLOEXEC. Racy. 
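+	 * (Another thread can fork()/exec() between pipe() and fcntl() here,
+	 * leaking the descriptors into the child.)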
*/ + set_cloexec(fds[0]); + set_cloexec(fds[1]); + return 0; +#endif +} + int xs_fileno(struct xs_handle *h) { char c = 0; mutex_lock(&h->watch_mutex); - if ((h->watch_pipe[0] == -1) && (pipe(h->watch_pipe) != -1)) { + if ((h->watch_pipe[0] == -1) && (pipe_cloexec(h->watch_pipe) != -1)) { /* Kick things off if the watch list is already non-empty. */ if (!list_empty(&h->watch_list)) while (write(h->watch_pipe[1], &c, 1) != 1) @@ -193,16 +225,14 @@ static int get_socket(const char *connect_to) { struct sockaddr_un addr; - int sock, saved_errno, flags; + int sock, saved_errno; - sock = socket(PF_UNIX, SOCK_STREAM, 0); + sock = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); if (sock < 0) return -1; - if ((flags = fcntl(sock, F_GETFD)) < 0) - goto error; - flags |= FD_CLOEXEC; - if (fcntl(sock, F_SETFD, flags) < 0) + /* Compat for non-SOCK_CLOEXEC environments. Racy. */ + if (!SOCK_CLOEXEC && !set_cloexec(sock)) goto error; addr.sun_family = AF_UNIX; @@ -226,8 +256,24 @@ static int get_dev(const char *connect_to) { - /* We cannot open read-only because requests are writes */ - return open(connect_to, O_RDWR); + int fd, saved_errno; + + fd = open(connect_to, O_RDWR | O_CLOEXEC); + if (fd < 0) + return -1; + + /* Compat for non-O_CLOEXEC environments. Racy. */ + if (!O_CLOEXEC && !set_cloexec(fd)) + goto error; + + return fd; + +error: + saved_errno = errno; + close(fd); + errno = saved_errno; + + return -1; } static int all_restrict_cb(Xentoolcore__Active_Handle *ah, domid_t domid) { diff -Nru xen-4.17.3+10-g091466ba55/tools/misc/xen-cpuid.c xen-4.17.5/tools/misc/xen-cpuid.c --- xen-4.17.3+10-g091466ba55/tools/misc/xen-cpuid.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/misc/xen-cpuid.c 2024-08-14 09:03:57.000000000 +0000 @@ -172,7 +172,7 @@ [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl", [10] = "md-clear", [11] = "rtm-always-abort", /* 12 */ [13] = "tsx-force-abort", - [14] = "serialize", + [14] = "serialize", [15] = "hybrid", [16] = "tsxldtrk", [18] = "pconfig", [20] = "cet-ibt", @@ -237,7 +237,8 @@ [20] = "bhi-no", [21] = "xapic-status", /* 22 */ [23] = "ovrclk-status", [24] = "pbrsb-no", [25] = "gds-ctrl", - [26] = "gds-no", + [26] = "gds-no", [27] = "rfds-no", + [28] = "rfds-clear", }; static const char *const str_m10Ah[32] = diff -Nru xen-4.17.3+10-g091466ba55/tools/misc/xen-ucode.c xen-4.17.5/tools/misc/xen-ucode.c --- xen-4.17.3+10-g091466ba55/tools/misc/xen-ucode.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/misc/xen-ucode.c 2024-08-14 09:03:57.000000000 +0000 @@ -60,8 +60,11 @@ exit(1); } + errno = 0; ret = xc_microcode_update(xch, buf, len); - if ( ret ) + if ( ret == -1 && errno == EEXIST ) + printf("Microcode already up to date\n"); + else if ( ret ) { fprintf(stderr, "Failed to update microcode. 
(err: %s)\n", strerror(errno)); diff -Nru xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/Makefile xen-4.17.5/tools/ocaml/xenstored/Makefile --- xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/ocaml/xenstored/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -4,8 +4,7 @@ # Include configure output (config.h) CFLAGS += -include $(XEN_ROOT)/tools/config.h -CFLAGS-$(CONFIG_SYSTEMD) += $(SYSTEMD_CFLAGS) -LDFLAGS-$(CONFIG_SYSTEMD) += $(SYSTEMD_LIBS) +CFLAGS-$(CONFIG_SYSTEMD) += $(CFLAGS_xeninclude) CFLAGS += $(CFLAGS-y) CFLAGS += $(APPEND_CFLAGS) diff -Nru xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/quota.ml xen-4.17.5/tools/ocaml/xenstored/quota.ml --- xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/quota.ml 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/ocaml/xenstored/quota.ml 2024-08-14 09:03:57.000000000 +0000 @@ -23,66 +23,69 @@ let maxent = ref (1000) let maxsize = ref (2048) +module Domid = struct + type t = Xenctrl.domid + let compare (a:t) (b:t) = compare a b +end + +module DomidMap = Map.Make(Domid) + type t = { maxent: int; (* max entities per domU *) maxsize: int; (* max size of data store in one node *) - cur: (Xenctrl.domid, int) Hashtbl.t; (* current domains quota *) + cur: int DomidMap.t; (* current domains quota *) } let to_string quota domid = - if Hashtbl.mem quota.cur domid - then Printf.sprintf "dom%i quota: %i/%i" domid (Hashtbl.find quota.cur domid) quota.maxent - else Printf.sprintf "dom%i quota: not set" domid + try + Printf.sprintf "dom%i quota: %i/%i" domid (DomidMap.find domid quota.cur) quota.maxent + with Not_found -> + Printf.sprintf "dom%i quota: not set" domid let create () = - { maxent = !maxent; maxsize = !maxsize; cur = Hashtbl.create 100; } + { maxent = !maxent; maxsize = !maxsize; cur = DomidMap.empty; } -let copy quota = { quota with cur = (Hashtbl.copy quota.cur) } +let copy quota = { quota with cur = quota.cur } -let del quota id = Hashtbl.remove quota.cur id +let del quota id = { quota with cur = DomidMap.remove id quota.cur } let _check quota id size = if size > quota.maxsize then ( warn "domain %u err create entry: data too big %d" id size; raise Data_too_big ); - if id > 0 && Hashtbl.mem quota.cur id then - let entry = Hashtbl.find quota.cur id in + if id > 0 then + try + let entry = DomidMap.find id quota.cur in if entry >= quota.maxent then ( warn "domain %u cannot create entry: quota reached" id; raise Limit_reached ) + with Not_found -> () let check quota id size = if !activate then _check quota id size -let get_entry quota id = Hashtbl.find quota.cur id +let find_or_zero quota_cur id = + try DomidMap.find id quota_cur with Not_found -> 0 -let set_entry quota id nb = - if nb = 0 - then Hashtbl.remove quota.cur id - else begin - if Hashtbl.mem quota.cur id then - Hashtbl.replace quota.cur id nb - else - Hashtbl.add quota.cur id nb - end +let update_entry quota_cur id diff = + let nb = diff + find_or_zero quota_cur id in + if nb = 0 then DomidMap.remove id quota_cur + else DomidMap.add id nb quota_cur let del_entry quota id = - try - let nb = get_entry quota id in - set_entry quota id (nb - 1) - with Not_found -> () + {quota with cur = update_entry quota.cur id (-1)} let add_entry quota id = - let nb = try get_entry quota id with Not_found -> 0 in - set_entry quota id (nb + 1) - -let add quota diff = - Hashtbl.iter (fun id nb -> set_entry quota id (get_entry quota id + nb)) diff.cur + {quota with cur = update_entry quota.cur id (+1)} let merge orig_quota mod_quota 
dest_quota = - Hashtbl.iter (fun id nb -> let diff = nb - (try get_entry orig_quota id with Not_found -> 0) in - if diff <> 0 then - set_entry dest_quota id ((try get_entry dest_quota id with Not_found -> 0) + diff)) mod_quota.cur + let fold_merge id nb dest = + match nb - find_or_zero orig_quota.cur id with + | 0 -> dest (* not modified *) + | diff -> update_entry dest id diff (* update with [x=x+diff] *) + in + {dest_quota with cur = DomidMap.fold fold_merge mod_quota.cur dest_quota.cur} + (* dest_quota = dest_quota + (mod_quota - orig_quota) *) diff -Nru xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/store.ml xen-4.17.5/tools/ocaml/xenstored/store.ml --- xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/store.ml 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/ocaml/xenstored/store.ml 2024-08-14 09:03:57.000000000 +0000 @@ -85,7 +85,9 @@ raise Define.Permission_denied; end -let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children +let rec recurse fct node acc = + let acc = fct node acc in + SymbolMap.fold (fun _ -> recurse fct) node.children acc (** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, possibly removing some nodes. @@ -408,7 +410,7 @@ let set_node store path node orig_quota mod_quota = let root = Path.set_node store.root path node in store.root <- root; - Quota.merge orig_quota mod_quota store.quota + store.quota <- Quota.merge orig_quota mod_quota store.quota let write store perm path value = let node, existing = get_deepest_existing_node store path in @@ -422,7 +424,7 @@ let root, node_created = path_write store perm path value in store.root <- root; if node_created - then Quota.add_entry store.quota owner + then store.quota <- Quota.add_entry store.quota owner let mkdir store perm path = let node, existing = get_deepest_existing_node store path in @@ -431,7 +433,7 @@ if not (existing || (Perms.Connection.is_dom0 perm)) then Quota.check store.quota owner 0; store.root <- path_mkdir store perm path; if not existing then - Quota.add_entry store.quota owner + store.quota <- Quota.add_entry store.quota owner let rm store perm path = let rmed_node = Path.get_node store.root path in @@ -439,7 +441,7 @@ | None -> raise Define.Doesnt_exist | Some rmed_node -> store.root <- path_rm store perm path; - Node.recurse (fun node -> Quota.del_entry store.quota (Node.get_owner node)) rmed_node + store.quota <- Node.recurse (fun node quota -> Quota.del_entry quota (Node.get_owner node)) rmed_node store.quota let setperms store perm path nperms = match Path.get_node store.root path with @@ -450,8 +452,9 @@ if not ((old_owner = new_owner) || (Perms.Connection.is_dom0 perm)) then raise Define.Permission_denied; store.root <- path_setperms store perm path nperms; - Quota.del_entry store.quota old_owner; - Quota.add_entry store.quota new_owner + store.quota <- + let quota = Quota.del_entry store.quota old_owner in + Quota.add_entry quota new_owner let reset_permissions store domid = Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; diff -Nru xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/systemd_stubs.c xen-4.17.5/tools/ocaml/xenstored/systemd_stubs.c --- xen-4.17.3+10-g091466ba55/tools/ocaml/xenstored/systemd_stubs.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/ocaml/xenstored/systemd_stubs.c 2024-08-14 09:03:57.000000000 +0000 @@ -25,7 +25,7 @@ #if defined(HAVE_SYSTEMD) -#include <systemd/sd-daemon.h> +#include <xen-sd-notify.h> CAMLprim value ocaml_sd_notify_ready(value ignore) { diff -Nru 
xen-4.17.3+10-g091466ba55/tools/tests/resource/test-resource.c xen-4.17.5/tools/tests/resource/test-resource.c --- xen-4.17.3+10-g091466ba55/tools/tests/resource/test-resource.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/tests/resource/test-resource.c 2024-08-14 09:03:57.000000000 +0000 @@ -20,6 +20,8 @@ static xenforeignmemory_handle *fh; static xengnttab_handle *gh; +static xc_physinfo_t physinfo; + static void test_gnttab(uint32_t domid, unsigned int nr_frames, unsigned long gfn) { @@ -172,6 +174,37 @@ printf("Test %s\n", t->name); +#if defined(__x86_64__) || defined(__i386__) + if ( t->create.flags & XEN_DOMCTL_CDF_hvm ) + { + if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_hvm) ) + { + printf(" Skip: HVM not available\n"); + continue; + } + + /* + * On x86, use HAP guests if possible, but skip if neither HAP nor + * SHADOW is available. + */ + if ( physinfo.capabilities & XEN_SYSCTL_PHYSCAP_hap ) + t->create.flags |= XEN_DOMCTL_CDF_hap; + else if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_shadow) ) + { + printf(" Skip: Neither HAP or SHADOW available\n"); + continue; + } + } + else + { + if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_pv) ) + { + printf(" Skip: PV not available\n"); + continue; + } + } +#endif + rc = xc_domain_create(xch, &domid, &t->create); if ( rc ) { @@ -214,6 +247,8 @@ int main(int argc, char **argv) { + int rc; + printf("XENMEM_acquire_resource tests\n"); xch = xc_interface_open(NULL, NULL, 0); @@ -227,6 +262,10 @@ if ( !gh ) err(1, "xengnttab_open"); + rc = xc_physinfo(xch, &physinfo); + if ( rc ) + err(1, "Failed to obtain physinfo"); + test_domain_configurations(); return !!nr_failures; diff -Nru xen-4.17.3+10-g091466ba55/tools/tests/tsx/test-tsx.c xen-4.17.5/tools/tests/tsx/test-tsx.c --- xen-4.17.3+10-g091466ba55/tools/tests/tsx/test-tsx.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/tests/tsx/test-tsx.c 2024-08-14 09:03:57.000000000 +0000 @@ -311,25 +311,25 @@ dump_tsx_details(max, "Max:"); dump_tsx_details(def, "Def:"); - if ( ((max->feat.raw[0].d | def->feat.raw[0].d) & - (bitmaskof(X86_FEATURE_TSX_FORCE_ABORT) | - bitmaskof(X86_FEATURE_RTM_ALWAYS_ABORT) | - bitmaskof(X86_FEATURE_SRBDS_CTRL))) || - ((max->arch_caps.raw | def->arch_caps.raw) & ARCH_CAPS_TSX_CTRL) ) + if ( max->feat.tsx_force_abort || def->feat.tsx_force_abort || + max->feat.srbds_ctrl || def->feat.srbds_ctrl || + max->arch_caps.tsx_ctrl || def->arch_caps.tsx_ctrl ) fail(" Xen-only TSX controls offered to guest\n"); switch ( rtm_behaviour ) { case RTM_UD: - if ( (max->feat.raw[0].b | def->feat.raw[0].b) & - (bitmaskof(X86_FEATURE_HLE) | bitmaskof(X86_FEATURE_RTM)) ) - fail(" HLE/RTM offered to guests despite not being available\n"); + if ( max->feat.hle || def->feat.hle || + max->feat.rtm || def->feat.rtm || + max->feat.rtm_always_abort || def->feat.rtm_always_abort ) + fail(" HLE/RTM/RTM_AA offered to guests despite not being available\n"); break; case RTM_ABORT: - if ( def->feat.raw[0].b & - (bitmaskof(X86_FEATURE_HLE) | bitmaskof(X86_FEATURE_RTM)) ) + if ( def->feat.hle || def->feat.rtm ) fail(" HLE/RTM offered to guests by default despite not being usable\n"); + if ( !def->feat.rtm_always_abort ) + fail(" RTM_AA not offered to guests by default despite being available\n"); break; case RTM_OK: @@ -340,6 +340,9 @@ if ( def->feat.hle ) fail(" Fail: HLE offered in default policy\n"); + + if ( def->feat.rtm && def->feat.rtm_always_abort ) + fail(" Fail: Both RTM and RTM_AA offered in default policy\n"); } static void test_def_max_policies(void) @@ -388,14 
+391,13 @@ if ( guest_policy.policy.feat.hle || guest_policy.policy.feat.tsx_force_abort || - guest_policy.policy.feat.rtm_always_abort || guest_policy.policy.feat.srbds_ctrl || guest_policy.policy.arch_caps.tsx_ctrl ) fail(" Unexpected features advertised\n"); if ( host.policy.feat.rtm ) { - unsigned int _7b0; + unsigned int _7b0, _7d0; /* * If host RTM is available, all combinations of guest flags should be @@ -403,6 +405,8 @@ */ _7b0 = (guest_policy.policy.feat.raw[0].b ^= (bitmaskof(X86_FEATURE_HLE) | bitmaskof(X86_FEATURE_RTM))); + _7d0 = (guest_policy.policy.feat.raw[0].d ^= + bitmaskof(X86_FEATURE_RTM_ALWAYS_ABORT)); /* Set the new policy. */ rc = xc_cpu_policy_set_domain(xch, domid, &guest_policy); @@ -426,10 +430,17 @@ if ( guest_policy.policy.feat.raw[0].b != _7b0 ) { - fail(" Expected CPUID.7[1].b 0x%08x differs from actual 0x%08x\n", + fail(" Expected CPUID.7[0].b 0x%08x differs from actual 0x%08x\n", _7b0, guest_policy.policy.feat.raw[0].b); goto out; } + + if ( guest_policy.policy.feat.raw[0].d != _7d0 ) + { + fail(" Expected CPUID.7[0].d 0x%08x differs from actual 0x%08x\n", + _7d0, guest_policy.policy.feat.raw[0].d); + goto out; + } } out: @@ -514,6 +525,8 @@ i, errno, strerror(errno)); } + dump_tsx_details(&host.policy, "Host:"); + rc = xc_physinfo(xch, &physinfo); if ( rc ) return fail("Failed to obtain physinfo: %d - %s\n", diff -Nru xen-4.17.3+10-g091466ba55/tools/tests/xenstore/test-xenstore.c xen-4.17.5/tools/tests/xenstore/test-xenstore.c --- xen-4.17.3+10-g091466ba55/tools/tests/xenstore/test-xenstore.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/tests/xenstore/test-xenstore.c 2024-08-14 09:03:57.000000000 +0000 @@ -408,9 +408,9 @@ #define TEST(s, f, p, l) { s, f ## _init, f, f ## _deinit, (uintptr_t)(p), l } struct test tests[] = { TEST("read 1", test_read, 1, "Read node with 1 byte data"), -TEST("read 3000", test_read, 3000, "Read node with 3000 bytes data"), +TEST("read 2000", test_read, 2000, "Read node with 2000 bytes data"), TEST("write 1", test_write, 1, "Write node with 1 byte data"), -TEST("write 3000", test_write, 3000, "Write node with 3000 bytes data"), +TEST("write 2000", test_write, 2000, "Write node with 2000 bytes data"), TEST("dir", test_dir, 0, "List directory"), TEST("rm node", test_rm, 0, "Remove single node"), TEST("rm dir", test_rm, WRITE_BUFFERS_N, "Remove node with sub-nodes"), @@ -506,14 +506,14 @@ stop = time(NULL) + randtime; srandom((unsigned int)stop); - while ( time(NULL) < stop ) + while ( time(NULL) < stop && !ret ) { t = random() % ARRAY_SIZE(tests); ret = call_test(tests + t, iters, true); } } else - for ( t = 0; t < ARRAY_SIZE(tests); t++ ) + for ( t = 0; t < ARRAY_SIZE(tests) && !ret; t++ ) { if ( !test || !strcmp(test, tests[t].name) ) ret = call_test(tests + t, iters, false); @@ -525,10 +525,10 @@ xs_close(xsh); if ( ta_loops ) - printf("Exhaustive transaction retries (%d) occurrred %d times.\n", + printf("Exhaustive transaction retries (%d) occurred %d times.\n", MAX_TA_LOOPS, ta_loops); - return 0; + return ret ? 
3 : 0; } /* diff -Nru xen-4.17.3+10-g091466ba55/tools/xcutils/lsevtchn.c xen-4.17.5/tools/xcutils/lsevtchn.c --- xen-4.17.3+10-g091466ba55/tools/xcutils/lsevtchn.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/xcutils/lsevtchn.c 2024-08-14 09:03:57.000000000 +0000 @@ -3,6 +3,7 @@ #include #include #include +#include <errno.h> #include @@ -24,7 +25,23 @@ status.port = port; rc = xc_evtchn_status(xch, &status); if ( rc < 0 ) - break; + { + switch ( errno ) + { + case EACCES: /* Xen-owned evtchn */ + continue; + + case EINVAL: /* Port enumeration has ended */ + rc = 0; + break; + + default: + perror("xc_evtchn_status"); + rc = 1; + break; + } + goto out; + } if ( status.status == EVTCHNSTAT_closed ) continue; @@ -58,7 +75,8 @@ printf("\n"); } + out: xc_interface_close(xch); - return 0; + return rc; } diff -Nru xen-4.17.3+10-g091466ba55/tools/xenstore/Makefile xen-4.17.5/tools/xenstore/Makefile --- xen-4.17.3+10-g091466ba55/tools/xenstore/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/xenstore/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -9,11 +9,6 @@ xenstored: LDLIBS += -lrt xenstored: LDLIBS += $(SOCKET_LIBS) -ifeq ($(CONFIG_SYSTEMD),y) -$(XENSTORED_OBJS-y): CFLAGS += $(SYSTEMD_CFLAGS) -xenstored: LDLIBS += $(SYSTEMD_LIBS) -endif - xenstore: LDLIBS += $(LDLIBS_libxenstore) xenstore: LDLIBS += $(LDLIBS_libxentoolcore) xenstore: LDLIBS += $(SOCKET_LIBS) diff -Nru xen-4.17.3+10-g091466ba55/tools/xenstore/xenstored_core.c xen-4.17.5/tools/xenstore/xenstored_core.c --- xen-4.17.3+10-g091466ba55/tools/xenstore/xenstored_core.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/xenstore/xenstored_core.c 2024-08-14 09:03:57.000000000 +0000 @@ -61,7 +61,7 @@ #endif #if defined(XEN_SYSTEMD_ENABLED) -#include <systemd/sd-daemon.h> +#include <xen-sd-notify.h> #endif extern xenevtchn_handle *xce_handle; /* in xenstored_domain.c */ @@ -2888,7 +2888,7 @@ #if defined(XEN_SYSTEMD_ENABLED) if (!live_update) { sd_notify(1, "READY=1"); - fprintf(stderr, SD_NOTICE "xenstored is ready\n"); + fprintf(stderr, "xenstored is ready\n"); } #endif diff -Nru xen-4.17.3+10-g091466ba55/tools/xentop/xentop.c xen-4.17.5/tools/xentop/xentop.c --- xen-4.17.3+10-g091466ba55/tools/xentop/xentop.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/xentop/xentop.c 2024-08-14 09:03:57.000000000 +0000 @@ -85,6 +85,7 @@ static void set_prompt(const char *new_prompt, void (*func)(const char *)); static int handle_key(int); static int compare(unsigned long long, unsigned long long); +static int compare_dbl(double, double); static int compare_domains(xenstat_domain **, xenstat_domain **); static unsigned long long tot_net_bytes( xenstat_domain *, int); static bool tot_vbd_reqs(xenstat_domain *, int, unsigned long long *); @@ -422,6 +423,16 @@ return 0; } +/* Compares two double precision numbers, returning -1,0,1 for <,=,> */ +static int compare_dbl(double d1, double d2) +{ + if (d1 < d2) + return -1; + if (d1 > d2) + return 1; + return 0; +} + /* Comparison function for use with qsort. Compares two domains using the * current sort field. 
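 * (CPU % is a double; funnelling it through the integer compare()
 * truncated fractional differences, hence the compare_dbl variant above.)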
*/ static int compare_domains(xenstat_domain **domain1, xenstat_domain **domain2) @@ -523,7 +534,7 @@ static int compare_cpu_pct(xenstat_domain *domain1, xenstat_domain *domain2) { - return -compare(get_cpu_pct(domain1), get_cpu_pct(domain2)); + return -compare_dbl(get_cpu_pct(domain1), get_cpu_pct(domain2)); } /* Prints cpu percentage statistic */ @@ -684,7 +695,7 @@ unsigned long long dom1_vbd_oo = 0, dom2_vbd_oo = 0; tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom1_vbd_oo); - tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom2_vbd_oo); + tot_vbd_reqs(domain2, FIELD_VBD_OO, &dom2_vbd_oo); return -compare(dom1_vbd_oo, dom2_vbd_oo); } @@ -711,9 +722,9 @@ unsigned long long dom1_vbd_rd = 0, dom2_vbd_rd = 0; tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom1_vbd_rd); - tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom2_vbd_rd); + tot_vbd_reqs(domain2, FIELD_VBD_RD, &dom2_vbd_rd); - return -compare(dom1_vbd_rd, dom1_vbd_rd); + return -compare(dom1_vbd_rd, dom2_vbd_rd); } /* Prints number of total VBD READ requests statistic */ @@ -738,7 +749,7 @@ unsigned long long dom1_vbd_wr = 0, dom2_vbd_wr = 0; tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom1_vbd_wr); - tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom2_vbd_wr); + tot_vbd_reqs(domain2, FIELD_VBD_WR, &dom2_vbd_wr); return -compare(dom1_vbd_wr, dom2_vbd_wr); } @@ -765,7 +776,7 @@ unsigned long long dom1_vbd_rsect = 0, dom2_vbd_rsect = 0; tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom1_vbd_rsect); - tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom2_vbd_rsect); + tot_vbd_reqs(domain2, FIELD_VBD_RSECT, &dom2_vbd_rsect); return -compare(dom1_vbd_rsect, dom2_vbd_rsect); } diff -Nru xen-4.17.3+10-g091466ba55/tools/xl/xl_utils.c xen-4.17.5/tools/xl/xl_utils.c --- xen-4.17.3+10-g091466ba55/tools/xl/xl_utils.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/tools/xl/xl_utils.c 2024-08-14 09:03:57.000000000 +0000 @@ -27,6 +27,10 @@ #include "xl.h" #include "xl_utils.h" +#ifndef O_CLOEXEC +#define O_CLOEXEC 0 +#endif + void dolog(const char *file, int line, const char *func, const char *fmt, ...) { va_list ap; @@ -270,7 +274,7 @@ exit(-1); } - CHK_SYSCALL(logfile = open(fullname, O_WRONLY|O_CREAT|O_APPEND, 0644)); + CHK_SYSCALL(logfile = open(fullname, O_WRONLY | O_CREAT | O_APPEND | O_CLOEXEC, 0644)); free(fullname); assert(logfile >= 3); diff -Nru xen-4.17.3+10-g091466ba55/xen/Makefile xen-4.17.5/xen/Makefile --- xen-4.17.3+10-g091466ba55/xen/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -6,7 +6,7 @@ # All other places this is stored (eg. compile.h) should be autogenerated. export XEN_VERSION = 4 export XEN_SUBVERSION = 17 -export XEN_EXTRAVERSION ?= .4-pre$(XEN_VENDORVERSION) +export XEN_EXTRAVERSION ?= .5$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version @@ -25,8 +25,8 @@ endif # Best effort attempt to find a python interpreter, defaulting to Python 3 if -# available. Fall back to just `python` if `which` is nowhere to be found. -PYTHON_INTERPRETER := $(word 1,$(shell which python3 python python2 2>/dev/null) python) +# available. Fall back to just `python`. +PYTHON_INTERPRETER := $(word 1,$(shell command -v python3 || command -v python || command -v python2) python) export PYTHON ?= $(PYTHON_INTERPRETER) export CHECKPOLICY ?= checkpolicy @@ -374,6 +374,7 @@ # This exploits the 'multi-target pattern rule' trick. # The syncconfig should be executed only once to make all the targets. 
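# (auto.conf is removed up front: if syncconfig then fails, later builds
# see no auto.conf at all rather than silently consuming a stale one.)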
include/config/%.conf include/config/%.conf.cmd: $(KCONFIG_CONFIG) + $(Q)rm -f include/config/auto.conf $(Q)$(MAKE) $(build)=tools/kconfig syncconfig ifeq ($(CONFIG_DEBUG),y) diff -Nru xen-4.17.3+10-g091466ba55/xen/Rules.mk xen-4.17.5/xen/Rules.mk --- xen-4.17.3+10-g091466ba55/xen/Rules.mk 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/Rules.mk 2024-08-14 09:03:57.000000000 +0000 @@ -15,7 +15,9 @@ PHONY := __build __build: --include $(objtree)/include/config/auto.conf +ifneq ($(firstword $(subst /, ,$(obj))),tools) +include $(objtree)/include/config/auto.conf +endif include $(XEN_ROOT)/Config.mk include $(srctree)/scripts/Kbuild.include diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/arm/alternative.c xen-4.17.5/xen/arch/arm/alternative.c --- xen-4.17.3+10-g091466ba55/xen/arch/arm/alternative.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/arm/alternative.c 2024-08-14 09:03:57.000000000 +0000 @@ -55,7 +55,7 @@ return true; replptr = (unsigned long)ALT_REPL_PTR(alt); - if ( pc >= replptr && pc <= (replptr + alt->alt_len) ) + if ( pc >= replptr && pc <= (replptr + alt->repl_len) ) return false; /* @@ -139,9 +139,9 @@ continue; if ( alt->cpufeature == ARM_CB_PATCH ) - BUG_ON(alt->alt_len != 0); + BUG_ON(alt->repl_len != 0); else - BUG_ON(alt->alt_len != alt->orig_len); + BUG_ON(alt->repl_len != alt->orig_len); origptr = ALT_ORIG_PTR(alt); updptr = (void *)origptr + update_offset; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/arm/include/asm/alternative.h xen-4.17.5/xen/arch/arm/include/asm/alternative.h --- xen-4.17.3+10-g091466ba55/xen/arch/arm/include/asm/alternative.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/arm/include/asm/alternative.h 2024-08-14 09:03:57.000000000 +0000 @@ -13,16 +13,16 @@ struct alt_instr { s32 orig_offset; /* offset to original instruction */ - s32 alt_offset; /* offset to replacement instruction */ + s32 repl_offset; /* offset to replacement instruction */ u16 cpufeature; /* cpufeature bit set for replacement */ u8 orig_len; /* size of original instruction(s) */ - u8 alt_len; /* size of new instruction(s), <= orig_len */ + u8 repl_len; /* size of new instruction(s), <= orig_len */ }; /* Xen: helpers used by common code. */ #define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) -#define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) +#define ALT_REPL_PTR(a) __ALT_PTR(a, repl_offset) typedef void (*alternative_cb_t)(const struct alt_instr *alt, const uint32_t *origptr, uint32_t *updptr, @@ -90,12 +90,12 @@ #include #include -.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len +.macro altinstruction_entry orig_offset repl_offset feature orig_len repl_len .word \orig_offset - . - .word \alt_offset - . + .word \repl_offset - . 
.hword \feature .byte \orig_len - .byte \alt_len + .byte \repl_len .endm .macro alternative_insn insn1, insn2, cap, enable = 1 diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/arm/irq.c xen-4.17.5/xen/arch/arm/irq.c --- xen-4.17.3+10-g091466ba55/xen/arch/arm/irq.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/arm/irq.c 2024-08-14 09:03:57.000000000 +0000 @@ -229,6 +229,7 @@ { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + struct cpu_user_regs *old_regs = set_irq_regs(regs); perfc_incr(irqs); @@ -296,6 +297,7 @@ out_no_end: spin_unlock(&desc->lock); irq_exit(); + set_irq_regs(old_regs); } void release_irq(unsigned int irq, const void *dev_id) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/arm/setup.c xen-4.17.5/xen/arch/arm/setup.c --- xen-4.17.3+10-g091466ba55/xen/arch/arm/setup.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/arm/setup.c 2024-08-14 09:03:57.000000000 +0000 @@ -1025,7 +1025,7 @@ /* Register Xen's load address as a boot module. */ xen_bootmodule = add_boot_module(BOOTMOD_XEN, - (paddr_t)(uintptr_t)(_start + boot_phys_offset), + virt_to_maddr(_start), (paddr_t)(uintptr_t)(_end - _start), false); BUG_ON(!xen_bootmodule); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/Makefile xen-4.17.5/xen/arch/x86/Makefile --- xen-4.17.3+10-g091466ba55/xen/arch/x86/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -14,6 +14,7 @@ alternative-$(CONFIG_LIVEPATCH) := obj-bin-y += $(alternative-y) obj-y += apic.o +obj-y += bhb-thunk.o obj-y += bitops.o obj-bin-y += bzimage.init.o obj-bin-y += clear_page.o @@ -43,7 +44,7 @@ obj-y += msi.o obj-y += msr.o obj-$(CONFIG_INDIRECT_THUNK) += indirect-thunk.o -obj-y += ioport_emulate.o +obj-$(CONFIG_PV) += ioport_emulate.o obj-y += irq.o obj-$(CONFIG_KEXEC) += machine_kexec.o obj-y += mm.o x86_64/mm.o diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/cpufreq/cpufreq.c xen-4.17.5/xen/arch/x86/acpi/cpufreq/cpufreq.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/cpufreq/cpufreq.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/acpi/cpufreq/cpufreq.c 2024-08-14 09:03:57.000000000 +0000 @@ -622,12 +622,14 @@ return 0; } -static const struct cpufreq_driver __initconstrel acpi_cpufreq_driver = { +static const struct cpufreq_driver __initconst_cf_clobber +acpi_cpufreq_driver = { .name = "acpi-cpufreq", .verify = acpi_cpufreq_verify, .target = acpi_cpufreq_target, .init = acpi_cpufreq_cpu_init, .exit = acpi_cpufreq_cpu_exit, + .get = get_cur_freq_on_cpu, }; static int __init cf_check cpufreq_driver_init(void) @@ -653,6 +655,19 @@ } presmp_initcall(cpufreq_driver_init); +static int __init cf_check cpufreq_driver_late_init(void) +{ + /* + * While acpi_cpufreq_driver wants to unconditionally have all hooks + * populated for __initconst_cf_clobber to have as much of an effect as + * possible, zap the .get hook here (but not in cpufreq_driver_init()), + * until acpi_cpufreq_cpu_init() knows whether it's wanted / needed. 
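+ * (Function pointers in __initconst_cf_clobber data are devirtualised by
+ * altcall, after which their targets' ENDBR64 instructions can be
+ * clobbered; an unpopulated hook would leave its target out of that set.)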
+ */ + cpufreq_driver.get = NULL; + return 0; +} +__initcall(cpufreq_driver_late_init); + int cpufreq_cpu_init(unsigned int cpuid) { int ret; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/cpufreq/powernow.c xen-4.17.5/xen/arch/x86/acpi/cpufreq/powernow.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/cpufreq/powernow.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/acpi/cpufreq/powernow.c 2024-08-14 09:03:57.000000000 +0000 @@ -317,7 +317,8 @@ return 0; } -static const struct cpufreq_driver __initconstrel powernow_cpufreq_driver = { +static const struct cpufreq_driver __initconst_cf_clobber +powernow_cpufreq_driver = { .name = "powernow", .verify = powernow_cpufreq_verify, .target = powernow_cpufreq_target, diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/power.c xen-4.17.5/xen/arch/x86/acpi/power.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/acpi/power.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/acpi/power.c 2024-08-14 09:03:57.000000000 +0000 @@ -246,7 +246,7 @@ ci = get_cpu_info(); /* Avoid NMI/#MC using unsafe MSRs until we've reloaded microcode. */ - ci->spec_ctrl_flags &= ~SCF_IST_MASK; + ci->scf &= ~SCF_IST_MASK; ACPI_FLUSH_CPU_CACHE(); @@ -290,7 +290,7 @@ panic("Missing previously available feature(s)\n"); /* Re-enabled default NMI/#MC use of MSRs now microcode is loaded. */ - ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_IST_MASK); + ci->scf |= (default_scf & SCF_IST_MASK); if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/alternative.c xen-4.17.5/xen/arch/x86/alternative.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/alternative.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/alternative.c 2024-08-14 09:03:57.000000000 +0000 @@ -338,7 +338,7 @@ * Clobber endbr64 instructions now that altcall has finished optimising * all indirect branches to direct ones. */ - if ( force && cpu_has_xen_ibt ) + if ( force && cpu_has_xen_ibt && system_state < SYS_STATE_active ) { void *const *val; unsigned int clobbered = 0; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/bhb-thunk.S xen-4.17.5/xen/arch/x86/bhb-thunk.S --- xen-4.17.3+10-g091466ba55/xen/arch/x86/bhb-thunk.S 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/bhb-thunk.S 2024-08-14 09:03:57.000000000 +0000 @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Branch History Injection clearing sequences. + * + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/branch-history-injection.html + * + * Copyright (c) 2023, 2024 XenServer. + */ + .file __FILE__ + +#include + + .section .text.entry, "ax", @progbits + +/* + * Clear the Branch History Buffer using a TSX Abort. + * + * Any TSX Abort has a side effect of clearing the BHB, even when TSX is + * disabled for e.g. TAA mitigation reasons. + */ +ENTRY(clear_bhb_tsx) + .byte 0xc7, 0xf8; .long 1f - 0f /* xbegin 1f */ +0: .byte 0xc6, 0xf8, 0 /* xabort $0 */ + int3 +1: + ret + + .size clear_bhb_tsx, . - clear_bhb_tsx + .type clear_bhb_tsx, @function + +/* + * Clear the Branch History Buffer using the software sequence. + * + * Clobbers: %eax, %ecx + * + * This executes a specific number of taken branches, sufficient to displace + * all prior entries in the history tracker, therefore removing prior + * influence on subsequent BTB lookups. + * + * Structurally, it looks like this: + * + * call 1 + * call 2 + * ... 
5x jmp loop + * call 2 + * ... 5x jmp loop + * ... 5x call2's deep + * + * ret + * ret + * ret + * ret + * + * The CALL/RETs are necessary to prevent the Loop Stream Detector from + * interfering. The alignment is for performance and not safety. + * + * The "short" sequence (5 and 5) is for CPUs prior to Alder Lake / Sapphire + * Rapids (i.e. Cores prior to Golden Cove and/or Gracemont). + * + * The "long" sequence (12 and 7) is for Alder Lake / Sapphire Rapids + * (i.e. Golden Cove and/or Gracemont cores). However, such CPUs are expected + * to use BHI_DIS_S in preference. + */ +ENTRY(clear_bhb_loops) + ALTERNATIVE "mov $5, %ecx", "mov $12, %ecx", X86_SPEC_BHB_LOOPS_LONG + + call 1f + jmp 5f + int3 + + .align 64 +1: call 2f + ret + int3 + + .align 64 +2: ALTERNATIVE "mov $5, %eax", "mov $7, %eax", X86_SPEC_BHB_LOOPS_LONG + +3: jmp 4f + int3 + +4: sub $1, %eax + jnz 3b + + sub $1, %ecx + jnz 1b + + ret +5: + /* + * The Intel sequence has an LFENCE here. The purpose is to ensure + * that all prior branches have executed, before dispatching a + * subsequent indirect branch. + * + * Xen's SPEC_CTRL_ENTRY_* blocks have safety LFENCEs at the end when + * protections are active, which suffices for this purpose. + */ + + ret + + .size clear_bhb_loops, . - clear_bhb_loops + .type clear_bhb_loops, @function diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/amd.c xen-4.17.5/xen/arch/x86/cpu/amd.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/amd.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/amd.c 2024-08-14 09:03:57.000000000 +0000 @@ -258,6 +258,11 @@ #undef LAZY } +#ifdef CONFIG_XEN_IBT /* Announce the function to ENDBR clobbering logic. */ +static const typeof(ctxt_switch_masking) __initconst_cf_clobber __used csm = + amd_ctxt_switch_masking; +#endif + /* * Mask the features and extended features returned by CPUID. 
Parameters are * set from the boot line via two methods: @@ -1281,7 +1286,7 @@ amd_log_freq(c); } -const struct cpu_dev amd_cpu_dev = { +const struct cpu_dev __initconst_cf_clobber amd_cpu_dev = { .c_early_init = early_init_amd, .c_init = init_amd, }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/centaur.c xen-4.17.5/xen/arch/x86/cpu/centaur.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/centaur.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/centaur.c 2024-08-14 09:03:57.000000000 +0000 @@ -54,6 +54,6 @@ init_c3(c); } -const struct cpu_dev centaur_cpu_dev = { +const struct cpu_dev __initconst_cf_clobber centaur_cpu_dev = { .c_init = init_centaur, }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/common.c xen-4.17.5/xen/arch/x86/cpu/common.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/common.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/common.c 2024-08-14 09:03:57.000000000 +0000 @@ -115,13 +115,13 @@ __clear_bit(X86_FEATURE_SEP, c->x86_capability); } -static const struct cpu_dev default_cpu = { +static const struct cpu_dev __initconst_cf_clobber __used default_cpu = { .c_init = default_init, }; -static const struct cpu_dev *this_cpu = &default_cpu; +static struct cpu_dev __ro_after_init actual_cpu; static DEFINE_PER_CPU(uint64_t, msr_misc_features); -void (* __read_mostly ctxt_switch_masking)(const struct vcpu *next); +void (* __ro_after_init ctxt_switch_masking)(const struct vcpu *next); bool __init probe_cpuid_faulting(void) { @@ -343,12 +343,14 @@ c->x86_vendor = x86_cpuid_lookup_vendor(ebx, ecx, edx); switch (c->x86_vendor) { - case X86_VENDOR_INTEL: this_cpu = &intel_cpu_dev; break; - case X86_VENDOR_AMD: this_cpu = &amd_cpu_dev; break; - case X86_VENDOR_CENTAUR: this_cpu = ¢aur_cpu_dev; break; - case X86_VENDOR_SHANGHAI: this_cpu = &shanghai_cpu_dev; break; - case X86_VENDOR_HYGON: this_cpu = &hygon_cpu_dev; break; + case X86_VENDOR_INTEL: intel_unlock_cpuid_leaves(c); + actual_cpu = intel_cpu_dev; break; + case X86_VENDOR_AMD: actual_cpu = amd_cpu_dev; break; + case X86_VENDOR_CENTAUR: actual_cpu = centaur_cpu_dev; break; + case X86_VENDOR_SHANGHAI: actual_cpu = shanghai_cpu_dev; break; + case X86_VENDOR_HYGON: actual_cpu = hygon_cpu_dev; break; default: + actual_cpu = default_cpu; printk(XENLOG_ERR "Unrecognised or unsupported CPU vendor '%.12s'\n", c->x86_vendor_id); @@ -434,8 +436,8 @@ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); c->phys_proc_id = c->apicid; - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (actual_cpu.c_early_init) + alternative_vcall(actual_cpu.c_early_init, c); /* c_early_init() may have adjusted cpuid levels/features. Reread. */ c->cpuid_level = cpuid_eax(0); @@ -540,9 +542,8 @@ * At the end of this section, c->x86_capability better * indicate the features this CPU genuinely supports! 
*/ - if (this_cpu->c_init) - this_cpu->c_init(c); - + if (actual_cpu.c_init) + alternative_vcall(actual_cpu.c_init, c); if (c == &boot_cpu_data && !opt_pku) setup_clear_cpu_cap(X86_FEATURE_PKU); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/cpu.h xen-4.17.5/xen/arch/x86/cpu/cpu.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/cpu.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/cpu.h 2024-08-14 09:03:57.000000000 +0000 @@ -24,3 +24,5 @@ void amd_init_ssbd(const struct cpuinfo_x86 *c); void amd_init_spectral_chicken(void); void detect_zen2_null_seg_behaviour(void); + +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/hygon.c xen-4.17.5/xen/arch/x86/cpu/hygon.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/hygon.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/hygon.c 2024-08-14 09:03:57.000000000 +0000 @@ -88,7 +88,7 @@ amd_log_freq(c); } -const struct cpu_dev hygon_cpu_dev = { +const struct cpu_dev __initconst_cf_clobber hygon_cpu_dev = { .c_early_init = early_init_amd, .c_init = init_hygon, }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/intel.c xen-4.17.5/xen/arch/x86/cpu/intel.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/intel.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/intel.c 2024-08-14 09:03:57.000000000 +0000 @@ -220,6 +220,11 @@ #undef LAZY } +#ifdef CONFIG_XEN_IBT /* Announce the function to ENDBR clobbering logic. */ +static const typeof(ctxt_switch_masking) __initconst_cf_clobber __used csm = + intel_ctxt_switch_masking; +#endif + /* * opt_cpuid_mask_ecx/edx: cpuid.1[ecx, edx] feature mask. * For example, E8400[Intel Core 2 Duo Processor series] ecx = 0x0008E3FD, @@ -288,15 +293,11 @@ ctxt_switch_masking = intel_ctxt_switch_masking; } -static void cf_check early_init_intel(struct cpuinfo_x86 *c) +/* Unmask CPUID levels (and NX) if masked. 
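+ *
+ * Firmware can leave MSR_IA32_MISC_ENABLE.LIMIT_CPUID (capping the
+ * maximum basic CPUID leaf) and XD_DISABLE (hiding NX) set; undo both
+ * before feature detection.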
*/ +void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c) { - u64 misc_enable, disable; + uint64_t misc_enable, disable; - /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ - if (c->x86 == 15 && c->x86_cache_alignment == 64) - c->x86_cache_alignment = 128; - - /* Unmask CPUID levels and NX if masked: */ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID | @@ -304,17 +305,26 @@ if (disable) { wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable); bootsym(trampoline_misc_enable_off) |= disable; - bootsym(trampoline_efer) |= EFER_NXE; } - - if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) - printk(KERN_INFO "revised cpuid level: %d\n", - cpuid_eax(0)); + if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { + c->cpuid_level = cpuid_eax(0); + printk(KERN_INFO "revised cpuid level: %u\n", c->cpuid_level); + } if (disable & MSR_IA32_MISC_ENABLE_XD_DISABLE) { + bootsym(trampoline_efer) |= EFER_NXE; write_efer(read_efer() | EFER_NXE); printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n"); } +} + +static void cf_check early_init_intel(struct cpuinfo_x86 *c) +{ + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; + + intel_unlock_cpuid_leaves(c); /* CPUID workaround for Intel 0F33/0F34 CPU */ if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 && @@ -593,7 +603,7 @@ setup_clear_cpu_cap(X86_FEATURE_CLWB); } -const struct cpu_dev intel_cpu_dev = { +const struct cpu_dev __initconst_cf_clobber intel_cpu_dev = { .c_early_init = early_init_intel, .c_init = init_intel, }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mcaction.c xen-4.17.5/xen/arch/x86/cpu/mcheck/mcaction.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mcaction.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mcaction.c 2024-08-14 09:03:57.000000000 +0000 @@ -27,13 +27,6 @@ return rec; } -mce_check_addr_t mc_check_addr = NULL; - -void mce_register_addrcheck(mce_check_addr_t cbfunc) -{ - mc_check_addr = cbfunc; -} - void mc_memerr_dhandler(struct mca_binfo *binfo, enum mce_result *result, @@ -48,7 +41,8 @@ int vmce_vcpuid; unsigned int mc_vcpuid; - if ( !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL) ) + if ( !alternative_call(mce_callbacks.check_addr, bank->mc_status, + bank->mc_misc, MC_ADDR_PHYSICAL) ) { dprintk(XENLOG_WARNING, "No physical address provided for memory error\n"); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mcaction.h xen-4.17.5/xen/arch/x86/cpu/mcheck/mcaction.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mcaction.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mcaction.h 2024-08-14 09:03:57.000000000 +0000 @@ -12,9 +12,4 @@ #define MC_ADDR_PHYSICAL 0 #define MC_ADDR_VIRTUAL 1 -typedef bool (*mce_check_addr_t)(uint64_t status, uint64_t misc, int addr_type); -extern void mce_register_addrcheck(mce_check_addr_t); - -extern mce_check_addr_t mc_check_addr; - #endif diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce.c xen-4.17.5/xen/arch/x86/cpu/mcheck/mce.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mce.c 2024-08-14 09:03:57.000000000 +0000 @@ -82,47 +82,21 @@ fatal_trap(regs, 1); } -static x86_mce_vector_t _machine_check_vector = unexpected_machine_check; - -void x86_mce_vector_register(x86_mce_vector_t hdlr) -{ - 
_machine_check_vector = hdlr; -} +struct mce_callbacks __ro_after_init mce_callbacks = { + .handler = unexpected_machine_check, +}; +static const typeof(mce_callbacks.handler) __initconst_cf_clobber __used + default_handler = unexpected_machine_check; /* Call the installed machine check handler for this CPU setup. */ void do_machine_check(const struct cpu_user_regs *regs) { mce_enter(); - _machine_check_vector(regs); + alternative_vcall(mce_callbacks.handler, regs); mce_exit(); } -/* - * Init machine check callback handler - * It is used to collect additional information provided by newer - * CPU families/models without the need to duplicate the whole handler. - * This avoids having many handlers doing almost nearly the same and each - * with its own tweaks ands bugs. - */ -static x86_mce_callback_t mc_callback_bank_extended = NULL; - -void x86_mce_callback_register(x86_mce_callback_t cbfunc) -{ - mc_callback_bank_extended = cbfunc; -} - -/* - * Machine check recoverable judgement callback handler - * It is used to judge whether an UC error is recoverable by software - */ -static mce_recoverable_t mc_recoverable_scan = NULL; - -void mce_recoverable_register(mce_recoverable_t cbfunc) -{ - mc_recoverable_scan = cbfunc; -} - struct mca_banks *mcabanks_alloc(unsigned int nr_mce_banks) { struct mca_banks *mb; @@ -175,19 +149,6 @@ } /* - * Judging whether to Clear Machine Check error bank callback handler - * According to Intel latest MCA OS Recovery Writer's Guide, - * whether the error MCA bank needs to be cleared is decided by the mca_source - * and MCi_status bit value. - */ -static mce_need_clearbank_t mc_need_clearbank_scan = NULL; - -void mce_need_clearbank_register(mce_need_clearbank_t cbfunc) -{ - mc_need_clearbank_scan = cbfunc; -} - -/* * mce_logout_lock should only be used in the trap handler, * while MCIP has not been cleared yet in the global status * register. Other use is not safe, since an MCE trap can @@ -227,7 +188,8 @@ if ( (mib->mc_status & MCi_STATUS_MISCV) && (mib->mc_status & MCi_STATUS_ADDRV) && - (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) && + alternative_call(mce_callbacks.check_addr, mib->mc_status, + mib->mc_misc, MC_ADDR_PHYSICAL) && (who == MCA_POLLER || who == MCA_CMCI_HANDLER) && (mfn_valid(_mfn(paddr_to_pfn(mib->mc_addr)))) ) { @@ -327,7 +289,7 @@ * If no mc_recovery_scan callback handler registered, * this error is not recoverable */ - recover = mc_recoverable_scan ? 1 : 0; + recover = mce_callbacks.recoverable_scan; for ( i = 0; i < this_cpu(nr_mce_banks); i++ ) { @@ -344,8 +306,9 @@ * decide whether to clear bank by MCi_STATUS bit value such as * OVER/UC/EN/PCC/S/AR */ - if ( mc_need_clearbank_scan ) - need_clear = mc_need_clearbank_scan(who, status); + if ( mce_callbacks.need_clearbank_scan ) + need_clear = alternative_call(mce_callbacks.need_clearbank_scan, + who, status); /* * If this is the first bank with valid MCA DATA, then @@ -381,12 +344,12 @@ if ( recover && uc ) /* uc = true, recover = true, we need not panic. 
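The net effect of these mce.c/mce.h hunks: five ad-hoc registration helpers collapse into the single mce_callbacks structure, installed once on the BSP and invoked through the alternative-call machinery, with optional hooks still NULL-checked before use. Abridged from the diff (not complete, compilable code):

    void __init mce_handler_init(const struct mce_callbacks *cb)
    {
        mce_callbacks = *cb;            /* one-shot install, BSP only */
    }

    void do_machine_check(const struct cpu_user_regs *regs)
    {
        /* Mandatory hook, defaulting to unexpected_machine_check(). */
        alternative_vcall(mce_callbacks.handler, regs);
    }

    /* Optional hooks keep their guard, e.g.: */
    if ( mce_callbacks.need_clearbank_scan )
        need_clear = alternative_call(mce_callbacks.need_clearbank_scan,
                                      who, status);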
*/ - recover = mc_recoverable_scan(status); + recover = alternative_call(mce_callbacks.recoverable_scan, status); mca_init_bank(who, mci, i); - if ( mc_callback_bank_extended ) - mc_callback_bank_extended(mci, i, status); + if ( mce_callbacks.info_collect ) + alternative_vcall(mce_callbacks.info_collect, mci, i, status); /* By default, need_clear = true */ if ( who != MCA_MCE_SCAN && need_clear ) @@ -799,7 +762,7 @@ { case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - inited = amd_mcheck_init(c); + inited = amd_mcheck_init(c, bsp); break; case X86_VENDOR_INTEL: @@ -1913,12 +1876,11 @@ * will help to collect and log those MCE errors. * Round2: Do all MCE processing logic as normal. */ -void mce_handler_init(void) +void __init mce_handler_init(const struct mce_callbacks *cb) { - if ( smp_processor_id() != 0 ) - return; - /* callback register, do we really need so many callback? */ + mce_callbacks = *cb; + /* mce handler data initialization */ spin_lock_init(&mce_logout_lock); open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce.h xen-4.17.5/xen/arch/x86/cpu/mcheck/mce.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mce.h 2024-08-14 09:03:57.000000000 +0000 @@ -44,7 +44,7 @@ extern bool lmce_support; /* Init functions */ -enum mcheck_type amd_mcheck_init(struct cpuinfo_x86 *c); +enum mcheck_type amd_mcheck_init(const struct cpuinfo_x86 *c, bool bsp); enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp); void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); @@ -62,20 +62,12 @@ void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); -/* Register a handler for machine check exceptions. */ -typedef void (*x86_mce_vector_t)(const struct cpu_user_regs *regs); -extern void x86_mce_vector_register(x86_mce_vector_t); - /* * Common generic MCE handler that implementations may nominate * via x86_mce_vector_register. */ void cf_check mcheck_cmn_handler(const struct cpu_user_regs *regs); -/* Register a handler for judging whether mce is recoverable. */ -typedef bool (*mce_recoverable_t)(uint64_t status); -extern void mce_recoverable_register(mce_recoverable_t); - /* Read an MSR, checking for an interposed value first */ extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t, uint64_t *); @@ -134,30 +126,6 @@ extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, struct mca_banks *, struct mca_summary *, struct mca_banks *); -/* - * Register callbacks to be made during bank telemetry logout. - * Those callbacks are only available to those machine check handlers - * that call to the common mcheck_cmn_handler or who use the common - * telemetry logout function mcheck_mca_logout in error polling. - */ - -/* Register a handler for judging whether the bank need to be cleared */ -typedef bool (*mce_need_clearbank_t)(enum mca_source who, u64 status); -extern void mce_need_clearbank_register(mce_need_clearbank_t); - -/* - * Register a callback to collect additional information (typically non- - * architectural) provided by newer CPU families/models without the need - * to duplicate the whole handler resulting in various handlers each with - * its own tweaks and bugs. 
The callback receives an struct mc_info pointer - which it can use with x86_mcinfo_reserve to add additional telemetry, - the current MCA bank number we are reading telemetry from, and the - MCi_STATUS value for that bank. - */ -typedef struct mcinfo_extended *(*x86_mce_callback_t) - (struct mc_info *, uint16_t, uint64_t); -extern void x86_mce_callback_register(x86_mce_callback_t); - void *x86_mcinfo_reserve(struct mc_info *mi, unsigned int size, unsigned int type); void x86_mcinfo_dump(struct mc_info *mi); @@ -198,8 +166,44 @@ return 0; } -/* MC softirq */ -void mce_handler_init(void); +struct mce_callbacks { + void (*handler)(const struct cpu_user_regs *regs); + bool (*check_addr)(uint64_t status, uint64_t misc, int addr_type); + + /* Handler for judging whether mce is recoverable. */ + bool (*recoverable_scan)(uint64_t status); + + /* + * Callbacks to be made during bank telemetry logout. + * They are only available to those machine check handlers + * that call to the common mcheck_cmn_handler or who use the common + * telemetry logout function mcheck_mca_logout in error polling. + */ + + /* + * Judging whether to Clear Machine Check error bank callback handler. + * According to Intel's latest MCA OS Recovery Writer's Guide, whether + * the error MCA bank needs to be cleared is decided by the mca_source + * and MCi_status bit value. + */ + bool (*need_clearbank_scan)(enum mca_source who, u64 status); + + /* + * Callback to collect additional information (typically non- + * architectural) provided by newer CPU families/models without the need + * to duplicate the whole handler resulting in various handlers each with + * its own tweaks and bugs. The callback receives a struct mc_info pointer + * which it can use with x86_mcinfo_reserve to add additional telemetry, + * the current MCA bank number we are reading telemetry from, and the + * MCi_STATUS value for that bank. + */ + struct mcinfo_extended *(*info_collect) + (struct mc_info *mi, uint16_t bank, uint64_t status); +}; + +extern struct mce_callbacks mce_callbacks; + +void mce_handler_init(const struct mce_callbacks *cb); extern const struct mca_error_handler *mce_dhandlers; extern const struct mca_error_handler *mce_uhandlers; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce_amd.c xen-4.17.5/xen/arch/x86/cpu/mcheck/mce_amd.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce_amd.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mce_amd.c 2024-08-14 09:03:57.000000000 +0000 @@ -283,8 +283,21 @@ return 1; } +static const struct mce_callbacks __initconst_cf_clobber k8_callbacks = { + .handler = mcheck_cmn_handler, + .need_clearbank_scan = amd_need_clearbank_scan, +}; + +static const struct mce_callbacks __initconst_cf_clobber k10_callbacks = { + .handler = mcheck_cmn_handler, + .check_addr = mc_amd_addrcheck, + .recoverable_scan = mc_amd_recoverable_scan, + .need_clearbank_scan = amd_need_clearbank_scan, + .info_collect = amd_f10_handler, +}; + enum mcheck_type -amd_mcheck_init(struct cpuinfo_x86 *ci) +amd_mcheck_init(const struct cpuinfo_x86 *ci, bool bsp) { uint32_t i; enum mcequirk_amd_flags quirkflag = 0; @@ -294,9 +307,8 @@ /* Assume that machine check support is available. * The minimum provided support is at least the K8. */ - mce_handler_init(); - x86_mce_vector_register(mcheck_cmn_handler); - mce_need_clearbank_register(amd_need_clearbank_scan); + if ( bsp ) + mce_handler_init(ci->x86 == 0xf ?
&k8_callbacks : &k10_callbacks); for ( i = 0; i < this_cpu(nr_mce_banks); i++ ) { @@ -336,10 +348,6 @@ ppin_msr = MSR_AMD_PPIN; } - x86_mce_callback_register(amd_f10_handler); - mce_recoverable_register(mc_amd_recoverable_scan); - mce_register_addrcheck(mc_amd_addrcheck); - return ci->x86_vendor == X86_VENDOR_HYGON ? mcheck_hygon : mcheck_amd_famXX; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce_intel.c xen-4.17.5/xen/arch/x86/cpu/mcheck/mce_intel.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mcheck/mce_intel.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mcheck/mce_intel.c 2024-08-14 09:03:57.000000000 +0000 @@ -814,7 +814,7 @@ return; } -static void intel_init_mce(void) +static void intel_init_mce(bool bsp) { uint64_t msr_content; int i; @@ -840,10 +840,8 @@ if ( firstbank ) /* if cmci enabled, firstbank = 0 */ wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL); - x86_mce_vector_register(mcheck_cmn_handler); - mce_recoverable_register(intel_recoverable_scan); - mce_need_clearbank_register(intel_need_clearbank_scan); - mce_register_addrcheck(intel_checkaddr); + if ( !bsp ) + return; mce_dhandlers = intel_mce_dhandlers; mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers); @@ -954,6 +952,13 @@ return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } +static const struct mce_callbacks __initconst_cf_clobber intel_callbacks = { + .handler = mcheck_cmn_handler, + .check_addr = intel_checkaddr, + .recoverable_scan = intel_recoverable_scan, + .need_clearbank_scan = intel_need_clearbank_scan, +}; + static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; @@ -979,9 +984,10 @@ intel_init_mca(c); - mce_handler_init(); + if ( bsp ) + mce_handler_init(&intel_callbacks); - intel_init_mce(); + intel_init_mce(bsp); intel_init_cmci(c); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/amd.c xen-4.17.5/xen/arch/x86/cpu/microcode/amd.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/amd.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/microcode/amd.c 2024-08-14 09:03:57.000000000 +0000 @@ -222,12 +222,15 @@ uint32_t rev, old_rev = sig->rev; enum microcode_match_result result = microcode_fits(patch); + if ( result == MIS_UCODE ) + return -EINVAL; + /* * Allow application of the same revision to pick up SMT-specific changes * even if the revision of the other SMT thread is already up-to-date. */ - if ( result != NEW_UCODE && result != SAME_UCODE ) - return -EINVAL; + if ( result == OLD_UCODE ) + return -EEXIST; if ( check_final_patch_levels(sig) ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/core.c xen-4.17.5/xen/arch/x86/cpu/microcode/core.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/core.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/microcode/core.c 2024-08-14 09:03:57.000000000 +0000 @@ -637,7 +637,7 @@ "microcode: couldn't find any newer%s revision in the provided blob!\n", opt_ucode_allow_same ?
 (or the same)" : ""); microcode_free_patch(patch); - ret = -ENOENT; + ret = -EEXIST; goto put; } @@ -819,6 +819,8 @@ alternative_vcall(ucode_ops.collect_cpu_info); + printk(XENLOG_INFO "BSP microcode revision: 0x%08x\n", this_cpu(cpu_sig).rev); + if ( ucode_mod.mod_end || ucode_blob.size ) rc = early_microcode_update_cpu(); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/intel.c xen-4.17.5/xen/arch/x86/cpu/microcode/intel.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/microcode/intel.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/microcode/intel.c 2024-08-14 09:03:57.000000000 +0000 @@ -294,10 +294,13 @@ result = microcode_update_match(patch); - if ( result != NEW_UCODE && - !(opt_ucode_allow_same && result == SAME_UCODE) ) + if ( result == MIS_UCODE ) return -EINVAL; + if ( result == OLD_UCODE || + (result == SAME_UCODE && !opt_ucode_allow_same) ) + return -EEXIST; + wbinvd(); wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)patch->data); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/generic.c xen-4.17.5/xen/arch/x86/cpu/mtrr/generic.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/generic.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mtrr/generic.c 2024-08-14 09:03:57.000000000 +0000 @@ -287,7 +287,7 @@ } } -int cf_check generic_get_free_region( +int mtrr_get_free_region( unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. @@ -303,14 +303,14 @@ if (replace_reg >= 0 && replace_reg < max) return replace_reg; for (i = 0; i < max; ++i) { - mtrr_if->get(i, &lbase, &lsize, &ltype); + mtrr_get(i, &lbase, &lsize, &ltype); if (lsize == 0) return i; } return -ENOSPC; } -static void cf_check generic_get_mtrr( +void mtrr_get( unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { uint64_t _mask, _base; @@ -500,7 +500,7 @@ spin_unlock(&set_atomicity_lock); } -static void cf_check generic_set_all(void) +void mtrr_set_all(void) { unsigned long mask, count; unsigned long flags; @@ -523,7 +523,7 @@ } } -static void cf_check generic_set_mtrr( +void mtrr_set( unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) /* [SUMMARY] Set variable MTRR register on the local CPU. The register to set. @@ -567,7 +567,7 @@ local_irq_restore(flags); } -int cf_check generic_validate_add_page( +int mtrr_validate_add_page( unsigned long base, unsigned long size, unsigned int type) { unsigned long lbase, last; @@ -586,21 +586,9 @@ } -static int cf_check generic_have_wrcomb(void) +bool mtrr_have_wrcomb(void) { unsigned long config; rdmsrl(MSR_MTRRcap, config); return (config & (1ULL << 10)); } - -/* generic structure...
- */ -const struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = true, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, -}; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/main.c xen-4.17.5/xen/arch/x86/cpu/mtrr/main.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/main.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mtrr/main.c 2024-08-14 09:03:57.000000000 +0000 @@ -57,7 +57,7 @@ u64 __read_mostly size_or_mask; u64 __read_mostly size_and_mask; -const struct mtrr_ops *__read_mostly mtrr_if = NULL; +static bool __ro_after_init mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); @@ -78,23 +78,12 @@ return (x <= 6) ? mtrr_strings[x] : "?"; } -/* Returns non-zero if we have the write-combining memory type */ -static int have_wrcomb(void) -{ - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); -} - /* This function returns the number of variable MTRRs */ static void __init set_num_var_ranges(void) { - unsigned long config = 0; + unsigned long config; - if (use_intel()) { - rdmsrl(MSR_MTRRcap, config); - } else if (is_cpu(AMD)) - config = 2; - else if (is_cpu(CENTAUR)) - config = 8; + rdmsrl(MSR_MTRRcap, config); num_var_ranges = MASK_EXTR(config, MTRRcap_VCNT); } @@ -149,10 +138,10 @@ if (data->smp_reg == ~0U) /* update all mtrr registers */ /* At the cpu hot-add time this will reinitialize mtrr * registres on the existing cpus. It is ok. */ - mtrr_if->set_all(); + mtrr_set_all(); else /* single mtrr register update */ - mtrr_if->set(data->smp_reg, data->smp_base, - data->smp_size, data->smp_type); + mtrr_set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); atomic_dec(&data->count); while(atomic_read(&data->gate)) @@ -198,10 +187,9 @@ * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. + * CPU goes through the transition of updating MTRRs. + * When mtrr_set() is done, they again decrement data->count and wait for + * data.gate to be reset. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. * Everyone then enables interrupts and we all continue on. * @@ -251,9 +239,9 @@ if (reg == ~0U) /* update all mtrr registers */ /* at boot or resume time, this will reinitialize the mtrrs on * the bp. It is ok. 
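What the mtrr_ops deletion amounts to: with only the generic implementation left, mtrr_if degrades from an ops pointer into a plain "MTRRs present" flag, and every former vtable call becomes a direct call. Schematically (a hypothetical wrapper, with locals named as in main.c):

    static mtrr_type probe_reg(unsigned int reg)
    {
        unsigned long lbase, lsize;
        mtrr_type ltype;

        /* before: mtrr_if->get(reg, &lbase, &lsize, &ltype); */
        mtrr_get(reg, &lbase, &lsize, &ltype);   /* after: direct call */
        return ltype;
    }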
*/ - mtrr_if->set_all(); + mtrr_set_all(); else /* update the single mtrr register */ - mtrr_if->set(reg,base,size,type); + mtrr_set(reg, base, size, type); /* wait for the others */ while (atomic_read(&data.count)) @@ -319,7 +307,7 @@ if (!mtrr_if) return -ENXIO; - if ((error = mtrr_if->validate_add_page(base,size,type))) + if ((error = mtrr_validate_add_page(base, size, type))) return error; if (type >= MTRR_NUM_TYPES) { @@ -328,7 +316,7 @@ } /* If the type is WC, check that this processor supports it */ - if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + if ((type == MTRR_TYPE_WRCOMB) && !mtrr_have_wrcomb()) { printk(KERN_WARNING "mtrr: your processor doesn't support write-combining\n"); return -EOPNOTSUPP; @@ -350,7 +338,7 @@ /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { - mtrr_if->get(i, &lbase, &lsize, &ltype); + mtrr_get(i, &lbase, &lsize, &ltype); if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) continue; /* At this point we know there is some kind of overlap/enclosure */ @@ -385,7 +373,7 @@ goto out; } /* Search for an empty MTRR */ - i = mtrr_if->get_free_region(base, size, replace); + i = mtrr_get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); if (likely(replace < 0)) @@ -494,7 +482,7 @@ if (reg < 0) { /* Search for existing MTRR */ for (i = 0; i < max; ++i) { - mtrr_if->get(i, &lbase, &lsize, &ltype); + mtrr_get(i, &lbase, &lsize, &ltype); if (lbase == base && lsize == size) { reg = i; break; @@ -510,7 +498,7 @@ printk(KERN_WARNING "mtrr: register: %d too big\n", reg); goto out; } - mtrr_if->get(reg, &lbase, &lsize, &ltype); + mtrr_get(reg, &lbase, &lsize, &ltype); if (lsize < 1) { printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); goto out; @@ -568,7 +556,7 @@ void __init mtrr_bp_init(void) { if (cpu_has_mtrr) { - mtrr_if = &generic_mtrr_ops; + mtrr_if = true; size_or_mask = ~((1ULL << (paddr_bits - PAGE_SHIFT)) - 1); size_and_mask = ~size_or_mask & 0xfffff00000ULL; } @@ -576,24 +564,24 @@ if (mtrr_if) { set_num_var_ranges(); init_table(); - if (use_intel()) - get_mtrr_state(); + get_mtrr_state(); } } void mtrr_ap_init(void) { - if (!mtrr_if || !use_intel() || hold_mtrr_updates_on_aps) + if (!mtrr_if || hold_mtrr_updates_on_aps) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * hold_mtrr_updates_on_aps takes care of preventing unnecessary MTRR + * updates when batch starting the CPUs (see + * mtrr_aps_sync_{begin,end}()). + * + * Otherwise just apply the current system wide MTRR values to this AP. + * Note this doesn't require synchronization with the other CPUs, as + * there are strictly no modifications of the current MTRR values.
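Looking back at the microcode hunks a few lines up: AMD, Intel and core.c now agree on a single errno contract, where -EINVAL strictly means "this blob does not match the CPU" and -EEXIST means "nothing newer than what is already loaded". AMD accepts SAME_UCODE unconditionally (to pick up SMT-specific changes), while Intel accepts it only under opt_ucode_allow_same. A hypothetical helper capturing the mapping (the real code open-codes it per vendor):

    static int match_to_errno(enum microcode_match_result res, bool allow_same)
    {
        switch ( res )
        {
        case MIS_UCODE:  return -EINVAL;   /* wrong signature/platform */
        case OLD_UCODE:  return -EEXIST;   /* older than the running revision */
        case SAME_UCODE: return allow_same ? 0 : -EEXIST;
        case NEW_UCODE:  return 0;
        }
        return -EINVAL;
    }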
*/ - set_mtrr(~0U, 0, 0, 0); + mtrr_set_all(); } /** @@ -612,32 +600,25 @@ void mtrr_aps_sync_begin(void) { - if (!use_intel()) - return; hold_mtrr_updates_on_aps = 1; } void mtrr_aps_sync_end(void) { - if (!use_intel()) - return; set_mtrr(~0U, 0, 0, 0); hold_mtrr_updates_on_aps = 0; } void mtrr_bp_restore(void) { - if (!use_intel()) - return; - mtrr_if->set_all(); + mtrr_set_all(); } static int __init cf_check mtrr_init_finialize(void) { if (!mtrr_if) return 0; - if (use_intel()) - mtrr_state_warn(); + mtrr_state_warn(); return 0; } __initcall(mtrr_init_finialize); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/mtrr.h xen-4.17.5/xen/arch/x86/cpu/mtrr/mtrr.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/mtrr/mtrr.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/mtrr/mtrr.h 2024-08-14 09:03:57.000000000 +0000 @@ -6,40 +6,21 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 - -struct mtrr_ops { - u32 vendor; - bool use_intel_if; -// void (*init)(void); - void (*set)(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); - void (*set_all)(void); - - void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); - int (*get_free_region)(unsigned long base, unsigned long size, - int replace_reg); - int (*validate_add_page)(unsigned long base, unsigned long size, - unsigned int type); - int (*have_wrcomb)(void); -}; - -int cf_check generic_get_free_region( +void mtrr_get( + unsigned int reg, unsigned long *base, unsigned long *size, + mtrr_type *type); +void mtrr_set( + unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); +void mtrr_set_all(void); +int mtrr_get_free_region( unsigned long base, unsigned long size, int replace_reg); -int cf_check generic_validate_add_page( +int mtrr_validate_add_page( unsigned long base, unsigned long size, unsigned int type); - -extern const struct mtrr_ops generic_mtrr_ops; +bool mtrr_have_wrcomb(void); void get_mtrr_state(void); -extern void set_mtrr_ops(const struct mtrr_ops *); - extern u64 size_or_mask, size_and_mask; -extern const struct mtrr_ops *mtrr_if; - -#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) -#define use_intel() (mtrr_if && mtrr_if->use_intel_if) extern unsigned int num_var_ranges; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/shanghai.c xen-4.17.5/xen/arch/x86/cpu/shanghai.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu/shanghai.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu/shanghai.c 2024-08-14 09:03:57.000000000 +0000 @@ -15,6 +15,6 @@ init_intel_cacheinfo(c); } -const struct cpu_dev shanghai_cpu_dev = { +const struct cpu_dev __initconst_cf_clobber shanghai_cpu_dev = { .c_init = init_shanghai, }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu-policy.c xen-4.17.5/xen/arch/x86/cpu-policy.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpu-policy.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpu-policy.c 2024-08-14 09:03:57.000000000 +0000 @@ -436,6 +436,17 @@ __set_bit(X86_FEATURE_RRSBA, fs); /* + * These bits indicate that the VERW instruction may have gained + * scrubbing side effects. With pooling, they mean "you might migrate + * somewhere where scrubbing is necessary", and may need exposing on + * unaffected hardware. This is fine, because the VERW instruction + * has been around since the 286. 
+ */ + __set_bit(X86_FEATURE_MD_CLEAR, fs); + __set_bit(X86_FEATURE_FB_CLEAR, fs); + __set_bit(X86_FEATURE_RFDS_CLEAR, fs); + + /* * The Gather Data Sampling microcode mitigation (August 2023) has an * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. * @@ -447,6 +458,31 @@ raw_cpu_policy.feat.clwb ) __set_bit(X86_FEATURE_CLWB, fs); } + + /* + * Topology information inside the guest is entirely at the toolstack's + * discretion, and bears no relationship to the host we're running on. + * + * HTT identifies p->basic.lppp as valid + * CMP_LEGACY identifies p->extd.nc as valid + */ + __set_bit(X86_FEATURE_HTT, fs); + __set_bit(X86_FEATURE_CMP_LEGACY, fs); + + /* + * To mitigate Native-BHI, one option is to use a TSX Abort on capable + * systems. This is safe even if RTM has been disabled for other reasons + * via MSR_TSX_{CTRL,FORCE_ABORT}. However, a guest kernel doesn't get to + * know this type of information. + * + * Therefore the meaning of RTM_ALWAYS_ABORT has been adjusted, to instead + * mean "XBEGIN won't fault". This is enough for a guest kernel to make + * an informed choice WRT mitigating Native-BHI. + * + * If RTM-capable, we can run a VM which has seen RTM_ALWAYS_ABORT. + */ + if ( test_bit(X86_FEATURE_RTM, fs) ) + __set_bit(X86_FEATURE_RTM_ALWAYS_ABORT, fs); } static void __init guest_common_default_feature_adjustments(uint32_t *fs) @@ -470,6 +506,24 @@ __clear_bit(X86_FEATURE_RDRAND, fs); /* + * These bits indicate that the VERW instruction may have gained + * scrubbing side effects. The max policy has them set for migration + * reasons, so reset the default policy back to the host values in + * case we're unaffected. + */ + __clear_bit(X86_FEATURE_MD_CLEAR, fs); + if ( cpu_has_md_clear ) + __set_bit(X86_FEATURE_MD_CLEAR, fs); + + __clear_bit(X86_FEATURE_FB_CLEAR, fs); + if ( cpu_has_fb_clear ) + __set_bit(X86_FEATURE_FB_CLEAR, fs); + + __clear_bit(X86_FEATURE_RFDS_CLEAR, fs); + if ( cpu_has_rfds_clear ) + __set_bit(X86_FEATURE_RFDS_CLEAR, fs); + + /* * The Gather Data Sampling microcode mitigation (August 2023) has an * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. * @@ -484,14 +538,31 @@ } /* + * Topology information is at the toolstack's discretion so these are + * unconditionally set in max, but pick a default which matches the host. + */ + __clear_bit(X86_FEATURE_HTT, fs); + if ( cpu_has_htt ) + __set_bit(X86_FEATURE_HTT, fs); + + __clear_bit(X86_FEATURE_CMP_LEGACY, fs); + if ( cpu_has_cmp_legacy ) + __set_bit(X86_FEATURE_CMP_LEGACY, fs); + + /* * On certain hardware, speculative or errata workarounds can result in * TSX being placed in "force-abort" mode, where it doesn't actually * function as expected, but is technically compatible with the ISA. * * Do not advertise RTM to guests by default if it won't actually work. + * Instead, advertise RTM_ALWAYS_ABORT indicating that TSX Aborts are safe + * to use, e.g. for mitigating Native-BHI. */ if ( rtm_disabled ) + { __clear_bit(X86_FEATURE_RTM, fs); + __set_bit(X86_FEATURE_RTM_ALWAYS_ABORT, fs); + } } static void __init guest_common_feature_adjustments(uint32_t *fs) @@ -525,12 +596,27 @@ unsigned int i; *p = host_cpu_policy; + + /* + * Some VMs may have a larger-than-necessary feat max_subleaf. Allow them + * to migrate in. 
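These cpu-policy.c adjustments all follow one two-step shape: a bit which only matters so that a VM may migrate somewhere it is needed gets forced on in the max policy, and the default policy then snaps it back to what the host really has. Condensed for one of the VERW-scrubbing bits, using lines from the diff itself:

    /* guest_common_max_feature_adjustments(): safe to offer anywhere. */
    __set_bit(X86_FEATURE_MD_CLEAR, fs);

    /* guest_common_default_feature_adjustments(): reflect this host. */
    __clear_bit(X86_FEATURE_MD_CLEAR, fs);
    if ( cpu_has_md_clear )
        __set_bit(X86_FEATURE_MD_CLEAR, fs);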
+ */ + p->feat.max_subleaf = ARRAY_SIZE(p->feat.raw) - 1; + x86_cpu_policy_to_featureset(p, fs); for ( i = 0; i < ARRAY_SIZE(fs); ++i ) fs[i] &= pv_max_featuremask[i]; /* + * Xen at the time of writing (Feb 2024, 4.19 dev cycle) used to leak the + * host x2APIC capability into PV guests, but never supported the guest + * trying to turn x2APIC mode on. Tolerate an incoming VM which saw the + * x2APIC CPUID bit and is alive enough to migrate. + */ + __set_bit(X86_FEATURE_X2APIC, fs); + + /* * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional * availability, or admin choice), hide the feature. */ @@ -557,6 +643,10 @@ unsigned int i; *p = pv_max_cpu_policy; + + /* Default to the same max_subleaf as the host. */ + p->feat.max_subleaf = host_cpu_policy.feat.max_subleaf; + x86_cpu_policy_to_featureset(p, fs); for ( i = 0; i < ARRAY_SIZE(fs); ++i ) @@ -593,6 +683,13 @@ const uint32_t *mask; *p = host_cpu_policy; + + /* + * Some VMs may have a larger-than-necessary feat max_subleaf. Allow them + * to migrate in. + */ + p->feat.max_subleaf = ARRAY_SIZE(p->feat.raw) - 1; + x86_cpu_policy_to_featureset(p, fs); mask = hvm_hap_supported() ? @@ -679,6 +776,10 @@ const uint32_t *mask; *p = hvm_max_cpu_policy; + + /* Default to the same max_subleaf as the host. */ + p->feat.max_subleaf = host_cpu_policy.feat.max_subleaf; + x86_cpu_policy_to_featureset(p, fs); mask = hvm_hap_supported() ? @@ -808,14 +909,6 @@ } /* - * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits - * affect how to interpret topology information in other cpuid leaves. - */ - __set_bit(X86_FEATURE_HTT, max_fs); - __set_bit(X86_FEATURE_X2APIC, max_fs); - __set_bit(X86_FEATURE_CMP_LEGACY, max_fs); - - /* * 32bit PV domains can't use any Long Mode features, and cannot use * SYSCALL on non-AMD hardware. */ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/cpuid.c xen-4.17.5/xen/arch/x86/cpuid.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/cpuid.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/cpuid.c 2024-08-14 09:03:57.000000000 +0000 @@ -330,24 +330,20 @@ case XSTATE_CPUID: switch ( subleaf ) { - case 1: - if ( p->xstate.xsavec || p->xstate.xsaves ) - { - /* - * TODO: Figure out what to do for XSS state. VT-x manages - * host vs guest MSR_XSS automatically, so as soon as we start - * supporting any XSS states, the wrong XSS will be in - * context. - */ - BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0); - - /* - * Read CPUID[0xD,0/1].EBX from hardware. They vary with - * enabled XSTATE, and appropraite XCR0|XSS are in context. - */ + /* + * Read CPUID[0xd,0/1].EBX from hardware. They vary with enabled + * XSTATE, and the appropriate XCR0 is in context. + */ case 0: - res->b = cpuid_count_ebx(leaf, subleaf); - } + if ( p->basic.xsave ) + res->b = cpuid_count_ebx(0xd, 0); + break; + + case 1: + /* This only works because Xen doesn't support XSS states yet. */ + BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0); + if ( p->xstate.xsavec ) + res->b = cpuid_count_ebx(0xd, 1); break; } break; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/domain.c xen-4.17.5/xen/arch/x86/domain.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/domain.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/domain.c 2024-08-14 09:03:57.000000000 +0000 @@ -833,7 +833,8 @@ } else if ( is_pv_domain(d) ) { - mapcache_domain_init(d); + if ( (rc = mapcache_domain_init(d)) != 0 ) + goto fail; if ( (rc = pv_domain_initialise(d)) != 0 ) goto fail; @@ -2096,10 +2097,10 @@ } } - /* Update the top-of-stack block with the new spec_ctrl settings. 
*/ - info->spec_ctrl_flags = - (info->spec_ctrl_flags & ~SCF_DOM_MASK) | - (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK); + /* Update the top-of-stack block with the new speculation settings. */ + info->scf = + (info->scf & ~SCF_DOM_MASK) | + (nextd->arch.scf & SCF_DOM_MASK); } sched_context_switched(prev, next); @@ -2112,12 +2113,12 @@ /* Ensure that the vcpu has an up-to-date time base. */ update_vcpu_system_time(next); - reset_stack_and_jump_ind(nextd->arch.ctxt_switch->tail); + reset_stack_and_call_ind(nextd->arch.ctxt_switch->tail); } void continue_running(struct vcpu *same) { - reset_stack_and_jump_ind(same->domain->arch.ctxt_switch->tail); + reset_stack_and_call_ind(same->domain->arch.ctxt_switch->tail); } int __sync_local_execstate(void) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/extable.c xen-4.17.5/xen/arch/x86/extable.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/extable.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/extable.c 2024-08-14 09:03:57.000000000 +0000 @@ -86,26 +86,29 @@ } unsigned long -search_exception_table(const struct cpu_user_regs *regs) +search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) { const struct virtual_region *region = find_text_region(regs->rip); unsigned long stub = this_cpu(stubs.addr); if ( region && region->ex ) + { + *stub_ra = 0; return search_one_extable(region->ex, region->ex_end, regs->rip); + } if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && regs->rip < stub + STUB_BUF_SIZE && regs->rsp > (unsigned long)regs && regs->rsp < (unsigned long)get_cpu_info() ) { - unsigned long retptr = *(unsigned long *)regs->rsp; + unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; - region = find_text_region(retptr); - retptr = region && region->ex - ? search_one_extable(region->ex, region->ex_end, retptr) - : 0; - if ( retptr ) + region = find_text_region(retaddr); + fixup = region && region->ex + ? search_one_extable(region->ex, region->ex_end, retaddr) + : 0; + if ( fixup ) { /* * Put trap number and error code on the stack (in place of the @@ -117,17 +120,19 @@ }; *(unsigned long *)regs->rsp = token.raw; - return retptr; + *stub_ra = retaddr; + return fixup; } } return 0; } -#ifndef NDEBUG +#ifdef CONFIG_DEBUG +#include #include -static int __init cf_check stub_selftest(void) +int __init cf_check stub_selftest(void) { static const struct { uint8_t opc[8]; @@ -151,7 +156,8 @@ unsigned int i; bool fail = false; - printk("Running stub recovery selftests...\n"); + printk("%s stub recovery selftests...\n", + system_state < SYS_STATE_active ? "Running" : "Re-running"); for ( i = 0; i < ARRAY_SIZE(tests); ++i ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/bigsmp.c xen-4.17.5/xen/arch/x86/genapic/bigsmp.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/bigsmp.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/genapic/bigsmp.c 2024-08-14 09:03:57.000000000 +0000 @@ -41,7 +41,7 @@ return def_to_bigsmp; } -const struct genapic __initconstrel apic_bigsmp = { +const struct genapic __initconst_cf_clobber apic_bigsmp = { APIC_INIT("bigsmp", probe_bigsmp), GENAPIC_PHYS }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/default.c xen-4.17.5/xen/arch/x86/genapic/default.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/default.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/genapic/default.c 2024-08-14 09:03:57.000000000 +0000 @@ -14,7 +14,7 @@ #include /* should be called last. 
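The reworked search_exception_table() grows an out-parameter so its caller can tell a plain extable fixup apart from a fault inside the per-CPU emulation stubs, where the on-stack return address is also of interest. A hypothetical call site (what the real caller goes on to do with stub_ra lies outside this hunk):

    unsigned long stub_ra = 0;
    unsigned long fixup = search_exception_table(regs, &stub_ra);

    if ( fixup )
    {
        /*
         * stub_ra is 0 for a regular extable hit, or the stub's return
         * address recovered from the stack for a fault inside the stubs.
         */
        regs->rip = fixup;
    }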
*/ -const struct genapic __initconstrel apic_default = { +const struct genapic __initconst_cf_clobber apic_default = { APIC_INIT("default", NULL), GENAPIC_FLAT }; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/probe.c xen-4.17.5/xen/arch/x86/genapic/probe.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/probe.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/genapic/probe.c 2024-08-14 09:03:57.000000000 +0000 @@ -16,7 +16,7 @@ #include #include -struct genapic __read_mostly genapic; +struct genapic __ro_after_init genapic; static const struct genapic *const __initconstrel apic_probe[] = { &apic_bigsmp, diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/x2apic.c xen-4.17.5/xen/arch/x86/genapic/x2apic.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/genapic/x2apic.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/genapic/x2apic.c 2024-08-14 09:03:57.000000000 +0000 @@ -169,7 +169,7 @@ local_irq_restore(flags); } -static const struct genapic __initconstrel apic_x2apic_phys = { +static const struct genapic __initconst_cf_clobber apic_x2apic_phys = { APIC_INIT("x2apic_phys", NULL), .int_delivery_mode = dest_Fixed, .int_dest_mode = 0 /* physical delivery */, @@ -180,7 +180,7 @@ .send_IPI_self = send_IPI_self_x2apic }; -static const struct genapic __initconstrel apic_x2apic_cluster = { +static const struct genapic __initconst_cf_clobber apic_x2apic_cluster = { APIC_INIT("x2apic_cluster", NULL), .int_delivery_mode = dest_LowestPrio, .int_dest_mode = 1 /* logical delivery */, @@ -198,7 +198,7 @@ * IPIs to be more efficiently delivered by not having to perform an ICR write * for each target CPU. */ -static const struct genapic __initconstrel apic_x2apic_mixed = { +static const struct genapic __initconst_cf_clobber apic_x2apic_mixed = { APIC_INIT("x2apic_mixed", NULL), /* diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/hyperv/hyperv.c xen-4.17.5/xen/arch/x86/guest/hyperv/hyperv.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/hyperv/hyperv.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/guest/hyperv/hyperv.c 2024-08-14 09:03:57.000000000 +0000 @@ -219,7 +219,7 @@ return hyperv_flush_tlb(mask, va, flags); } -static const struct hypervisor_ops __initconstrel ops = { +static const struct hypervisor_ops __initconst_cf_clobber ops = { .name = "Hyper-V", .setup = setup, .ap_setup = ap_setup, diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/hypervisor.c xen-4.17.5/xen/arch/x86/guest/hypervisor.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/hypervisor.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/guest/hypervisor.c 2024-08-14 09:03:57.000000000 +0000 @@ -25,7 +25,7 @@ #include #include -static struct hypervisor_ops __read_mostly ops; +static struct hypervisor_ops __ro_after_init ops; const char *__init hypervisor_probe(void) { @@ -61,7 +61,7 @@ int hypervisor_ap_setup(void) { if ( ops.ap_setup ) - return ops.ap_setup(); + return alternative_call(ops.ap_setup); return 0; } @@ -69,7 +69,7 @@ void hypervisor_resume(void) { if ( ops.resume ) - ops.resume(); + alternative_vcall(ops.resume); } void __init hypervisor_e820_fixup(struct e820map *e820) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/xen/xen.c xen-4.17.5/xen/arch/x86/guest/xen/xen.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/guest/xen/xen.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/guest/xen/xen.c 2024-08-14 09:03:57.000000000 +0000 @@ -330,7 +330,7 @@ return xen_hypercall_hvm_op(HVMOP_flush_tlbs, NULL); } -static const 
struct hypervisor_ops __initconstrel ops = { +static const struct hypervisor_ops __initconst_cf_clobber ops = { .name = "Xen", .setup = setup, .ap_setup = ap_setup, diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hpet.c xen-4.17.5/xen/arch/x86/hpet.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hpet.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hpet.c 2024-08-14 09:03:57.000000000 +0000 @@ -563,7 +563,7 @@ } } -void __init cf_check hpet_broadcast_init(void) +void __init hpet_broadcast_init(void) { u64 hpet_rate = hpet_setup(); u32 hpet_id, cfg; @@ -634,7 +634,7 @@ hpet_events->flags = HPET_EVT_LEGACY; } -void cf_check hpet_broadcast_resume(void) +void hpet_broadcast_resume(void) { u32 cfg; unsigned int i, n; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/emulate.c xen-4.17.5/xen/arch/x86/hvm/emulate.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/emulate.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/emulate.c 2024-08-14 09:03:57.000000000 +0000 @@ -697,7 +697,12 @@ out: /* Drop all held references. */ while ( mfn-- > hvmemul_ctxt->mfn ) + { put_page(mfn_to_page(*mfn)); +#ifndef NDEBUG /* Clean slot for a subsequent map()'s error checking. */ + *mfn = _mfn(0); +#endif + } return err; } @@ -719,7 +724,7 @@ for ( i = 0; i < nr_frames; i++ ) { - ASSERT(mfn_valid(*mfn)); + ASSERT(mfn_x(*mfn) && mfn_valid(*mfn)); paging_mark_dirty(currd, *mfn); put_page(mfn_to_page(*mfn)); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/hvm.c xen-4.17.5/xen/arch/x86/hvm/hvm.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/hvm.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/hvm.c 2024-08-14 09:03:57.000000000 +0000 @@ -3524,7 +3524,7 @@ fixed_range_base = (uint64_t *)v->arch.hvm.mtrr.fixed_ranges; if ( (ret = guest_rdmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE ) - return ret; + goto out; ret = X86EMUL_OKAY; @@ -5153,26 +5153,40 @@ int hvm_debug_op(struct vcpu *v, int32_t op) { - int rc; + int rc = 0; switch ( op ) { case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: - rc = -EOPNOTSUPP; if ( !cpu_has_monitor_trap_flag ) - break; - rc = 0; - vcpu_pause(v); - v->arch.hvm.single_step = - (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); - vcpu_unpause(v); /* guest will latch new state */ + return -EOPNOTSUPP; break; default: - rc = -ENOSYS; - break; + return -ENOSYS; } + vcpu_pause(v); + + switch ( op ) + { + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: + v->arch.hvm.single_step = true; + break; + + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: + v->arch.hvm.single_step = false; + v->arch.hvm.fast_single_step.enabled = false; + v->arch.hvm.fast_single_step.p2midx = 0; + break; + + default: /* Excluded above */ + ASSERT_UNREACHABLE(); + return -ENOSYS; + } + + vcpu_unpause(v); /* guest will latch new state */ + return rc; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/rtc.c xen-4.17.5/xen/arch/x86/hvm/rtc.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/rtc.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/rtc.c 2024-08-14 09:03:57.000000000 +0000 @@ -203,6 +203,7 @@ } else { + s->hw.cmos_data[RTC_REG_A] &= ~RTC_UIP; next_update_time = (USEC_PER_SEC - guest_usec - 244) * NS_PER_USEC; expire_time = NOW() + next_update_time; s->next_update_time = expire_time; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/svm/entry.S xen-4.17.5/xen/arch/x86/hvm/svm/entry.S --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/svm/entry.S 2024-02-02 07:04:33.000000000 +0000 +++ 
xen-4.17.5/xen/arch/x86/hvm/svm/entry.S 2024-08-14 09:03:57.000000000 +0000 @@ -63,14 +63,14 @@ /* SPEC_CTRL_EXIT_TO_SVM Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ .macro svm_vmentry_spec_ctrl mov VCPU_arch_msrs(%rbx), %rax - movzbl CPUINFO_last_spec_ctrl(%rsp), %edx + mov CPUINFO_last_spec_ctrl(%rsp), %edx mov VCPUMSR_spec_ctrl_raw(%rax), %eax cmp %edx, %eax je 1f /* Skip write if value is correct. */ mov $MSR_SPEC_CTRL, %ecx xor %edx, %edx wrmsr - mov %al, CPUINFO_last_spec_ctrl(%rsp) + mov %eax, CPUINFO_last_spec_ctrl(%rsp) 1: /* No Spectre v1 concerns. Execution will hit VMRUN imminently. */ .endm ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM @@ -102,8 +102,13 @@ /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo, %rdx=0 Clob: acd */ + /* + * IBPB is to mitigate BTC/SRSO on AMD/Hygon parts, in particular + * making type-confused RETs safe to use. This is not needed on Zen5 + * and later parts when SRSO_MSR_FIX (BP-SPEC-REDUCE) is in use. + */ .macro svm_vmexit_cond_ibpb - testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) + testb $SCF_entry_ibpb, CPUINFO_scf(%rsp) jz .L_skip_ibpb mov $MSR_PRED_CMD, %ecx @@ -113,17 +118,26 @@ .endm ALTERNATIVE "", svm_vmexit_cond_ibpb, X86_FEATURE_IBPB_ENTRY_HVM + /* + * RSB (RAS/RAP) stuffing is to prevent RET predictions following guest + * entries. This is not needed on Zen4 and later, when AutoIBRS is in + * use. + */ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM + /* + * Restore Xen's MSR_SPEC_CTRL setting, making indirect CALLs/JMPs + * safe to use. The guest's setting resides in the VMCB. + */ .macro svm_vmexit_spec_ctrl - movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax - movzbl CPUINFO_last_spec_ctrl(%rsp), %edx + mov CPUINFO_xen_spec_ctrl(%rsp), %eax + mov CPUINFO_last_spec_ctrl(%rsp), %edx cmp %edx, %eax je 1f /* Skip write if value is correct. */ mov $MSR_SPEC_CTRL, %ecx xor %edx, %edx wrmsr - mov %al, CPUINFO_last_spec_ctrl(%rsp) + mov %eax, CPUINFO_last_spec_ctrl(%rsp) 1: .endm ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/svm/svm.c xen-4.17.5/xen/arch/x86/hvm/svm/svm.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/svm/svm.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/svm/svm.c 2024-08-14 09:03:57.000000000 +0000 @@ -1669,6 +1669,7 @@ if ( _svm_cpu_up(true) ) { + setup_clear_cpu_cap(X86_FEATURE_SVM); printk("SVM: failed to initialise.\n"); return NULL; } @@ -2625,7 +2626,8 @@ regs->rsp = vmcb->rsp; regs->rflags = vmcb->rflags; - hvm_invalidate_regs_fields(regs); + hvm_sanitize_regs_fields( + regs, !(vmcb_get_efer(vmcb) & EFER_LMA) || !(vmcb->cs.l)); if ( paging_mode_hap(v->domain) ) v->arch.hvm.guest_cr[3] = v->arch.hvm.hw_cr[3] = vmcb_get_cr3(vmcb); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/entry.S xen-4.17.5/xen/arch/x86/hvm/vmx/entry.S --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/entry.S 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/vmx/entry.S 2024-08-14 09:03:57.000000000 +0000 @@ -34,15 +34,42 @@ mov %rax,VCPU_hvm_guest_cr2(%rbx) /* SPEC_CTRL_ENTRY_FROM_VMX Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ + /* + * RSB stuffing is to prevent RET predictions following guest + * entries. This is *not* sufficient to flush all RSB entries on + * parts enumerating eIBRS, although the following restore_spec_ctrl + * does cover us. + */ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM + + /* + * Restore Xen's MSR_SPEC_CTRL setting.
The guest's value resides in + * the MSR load/save list. For Legacy IBRS, this flushes/inhibits + * indirect predictions and does not flush the RSB. For eIBRS, this + * prevents CALLs/JMPs using predictions learnt at a lower predictor + * mode, and it flushes the RSB. On eIBRS parts that also suffer from + * PBRSB, the prior RSB stuffing suffices to make the RSB safe. + */ .macro restore_spec_ctrl mov $MSR_SPEC_CTRL, %ecx - movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax + mov CPUINFO_xen_spec_ctrl(%rsp), %eax xor %edx, %edx wrmsr .endm ALTERNATIVE "", restore_spec_ctrl, X86_FEATURE_SC_MSR_HVM + + /* + * Clear the BHB to mitigate BHI. Used on eIBRS parts, and uses RETs + * itself so must be after we've performed all the RET-safety we can. + */ + testb $SCF_entry_bhb, CPUINFO_scf(%rsp) + jz .L_skip_bhb + ALTERNATIVE_2 "", \ + "call clear_bhb_loops", X86_SPEC_BHB_LOOPS, \ + "call clear_bhb_tsx", X86_SPEC_BHB_TSX +.L_skip_bhb: + + ALTERNATIVE "lfence", "", X86_SPEC_NO_LFENCE_ENTRY_VMX /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */ @@ -87,17 +114,39 @@ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ - DO_SPEC_CTRL_COND_VERW + /* + * All speculation safety work happens to be elsewhere. VERW is after + * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left + * to the MSR load list. + */ mov VCPU_hvm_guest_cr2(%rbx),%rax + mov %rax, %cr2 + + /* + * We need to perform two conditional actions (VERW, and Resume vs + * Launch) after popping GPRs. With some cunning, we can encode both + * of these in eflags together. + * + * Parity is only calculated over the bottom byte of the answer, while + * Sign is simply the top bit. + * + * Therefore, the final OR instruction ends up producing: + * SF = VCPU_vmx_launched + * PF = !SCF_verw + */ + BUILD_BUG_ON(SCF_verw & ~0xff) + movzbl VCPU_vmx_launched(%rbx), %ecx + shl $31, %ecx + movzbl CPUINFO_scf(%rsp), %eax + and $SCF_verw, %eax + or %eax, %ecx pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp - mov %rax,%cr2 - cmpb $0,VCPU_vmx_launched(%rbx) pop %rbx pop %r11 pop %r10 @@ -108,7 +157,13 @@ pop %rdx pop %rsi pop %rdi - je .Lvmx_launch + + jpe .L_skip_verw + /* VERW clobbers ZF, but preserves all others, including SF.
*/ + verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) +.L_skip_verw: + + jns .Lvmx_launch /*.Lvmx_resume:*/ VMRESUME diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/vmcs.c xen-4.17.5/xen/arch/x86/hvm/vmx/vmcs.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/vmcs.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/vmx/vmcs.c 2024-08-14 09:03:57.000000000 +0000 @@ -176,6 +176,7 @@ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; u32 vmx_secondary_exec_control __read_mostly; +uint64_t vmx_tertiary_exec_control __read_mostly; u32 vmx_vmexit_control __read_mostly; u32 vmx_vmentry_control __read_mostly; u64 vmx_ept_vpid_cap __read_mostly; @@ -214,6 +215,7 @@ P(cpu_has_vmx_tsc_scaling, "TSC Scaling"); P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection"); P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit"); + P(cpu_has_vmx_virt_spec_ctrl, "Virtualize SPEC_CTRL"); #undef P if ( !printed ) @@ -241,10 +243,32 @@ return ctl; } -static bool_t cap_check(const char *name, u32 expected, u32 saw) +static uint64_t adjust_vmx_controls2( + const char *name, uint64_t ctl_min, uint64_t ctl_opt, unsigned int msr, + bool *mismatch) +{ + uint64_t vmx_msr, ctl = ctl_min | ctl_opt; + + rdmsrl(msr, vmx_msr); + + ctl &= vmx_msr; /* bit == 0 ==> must be zero */ + + /* Ensure minimum (required) set of control bits are supported. */ + if ( ctl_min & ~ctl ) + { + *mismatch = true; + printk("VMX: CPU%u has insufficient %s (%#lx; requires %#lx)\n", + smp_processor_id(), name, ctl, ctl_min); + } + + return ctl; +} + +static bool cap_check( + const char *name, unsigned long expected, unsigned long saw) { if ( saw != expected ) - printk("VMX %s: saw %#x expected %#x\n", name, saw, expected); + printk("VMX %s: saw %#lx expected %#lx\n", name, saw, expected); return saw != expected; } @@ -254,6 +278,7 @@ u32 _vmx_pin_based_exec_control; u32 _vmx_cpu_based_exec_control; u32 _vmx_secondary_exec_control = 0; + uint64_t _vmx_tertiary_exec_control = 0; u64 _vmx_ept_vpid_cap = 0; u64 _vmx_misc_cap = 0; u32 _vmx_vmexit_control; @@ -287,7 +312,8 @@ opt = (CPU_BASED_ACTIVATE_MSR_BITMAP | CPU_BASED_TPR_SHADOW | CPU_BASED_MONITOR_TRAP_FLAG | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS); _vmx_cpu_based_exec_control = adjust_vmx_controls( "CPU-Based Exec Control", min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &mismatch); @@ -351,6 +377,15 @@ MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch); } + if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS ) + { + uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL; + + _vmx_tertiary_exec_control = adjust_vmx_controls2( + "Tertiary Exec Control", 0, opt, + MSR_IA32_VMX_PROCBASED_CTLS3, &mismatch); + } + /* The IA32_VMX_EPT_VPID_CAP MSR exists only when EPT or VPID available */ if ( _vmx_secondary_exec_control & (SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_VPID) ) @@ -481,6 +516,7 @@ vmx_pin_based_exec_control = _vmx_pin_based_exec_control; vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control; vmx_secondary_exec_control = _vmx_secondary_exec_control; + vmx_tertiary_exec_control = _vmx_tertiary_exec_control; vmx_ept_vpid_cap = _vmx_ept_vpid_cap; vmx_vmexit_control = _vmx_vmexit_control; vmx_vmentry_control = _vmx_vmentry_control; @@ -517,6 +553,9 @@ "Secondary Exec Control", vmx_secondary_exec_control, _vmx_secondary_exec_control); mismatch |= cap_check( + "Tertiary Exec Control", + vmx_tertiary_exec_control, _vmx_tertiary_exec_control); 
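The eflags trick in the vmx exit path above is worth unpacking: both pending decisions must survive the GPR pops, so they are folded into one OR, whose Sign flag picks up bit 31 (vmx_launched shifted left) and whose Parity flag is computed only over the low byte, which holds at most the single SCF_verw bit. A C model of the encoding (the model() wrapper is illustrative only):

    static void model(bool launched, uint8_t scf)
    {
        uint32_t res = ((uint32_t)launched << 31) | (scf & SCF_verw);

        /*
         * After "or %eax, %ecx":
         *  SF = bit 31 of res, i.e. launched; "jns .Lvmx_launch" (SF clear)
         *       takes the VMLAUNCH path on a first entry.
         *  PF = set iff the low byte has an even number of bits set.  With
         *       SCF_verw a single low bit, PF is set exactly when it is
         *       clear, so "jpe .L_skip_verw" skips the VERW in that case.
         */
        (void)res;
    }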
+ mismatch |= cap_check( "VMExit Control", vmx_vmexit_control, _vmx_vmexit_control); mismatch |= cap_check( @@ -1092,6 +1131,7 @@ v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING; v->arch.hvm.vmx.secondary_exec_control = vmx_secondary_exec_control; + v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control; /* * Disable features which we don't want active by default: @@ -1146,6 +1186,10 @@ __vmwrite(SECONDARY_VM_EXEC_CONTROL, v->arch.hvm.vmx.secondary_exec_control); + if ( cpu_has_vmx_tertiary_exec_control ) + __vmwrite(TERTIARY_VM_EXEC_CONTROL, + v->arch.hvm.vmx.tertiary_exec_control); + /* MSR access bitmap. */ if ( cpu_has_vmx_msr_bitmap ) { @@ -1334,6 +1378,12 @@ if ( cpu_has_vmx_tsc_scaling ) __vmwrite(TSC_MULTIPLIER, d->arch.hvm.tsc_scaling_ratio); + if ( cpu_has_vmx_virt_spec_ctrl ) + { + __vmwrite(SPEC_CTRL_MASK, 0); + __vmwrite(SPEC_CTRL_SHADOW, 0); + } + /* will update HOST & GUEST_CR3 as reqd */ paging_update_paging_modes(v); @@ -1343,7 +1393,7 @@ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, VMX_MSR_GUEST_LOADONLY); - if ( !rc && (d->arch.spec_ctrl_flags & SCF_entry_ibpb) ) + if ( !rc && (d->arch.scf & SCF_entry_ibpb) ) rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB, VMX_MSR_HOST); @@ -2044,6 +2094,9 @@ if ( v->arch.hvm.vmx.secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY ) printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS)); + if ( cpu_has_vmx_virt_spec_ctrl ) + printk("SPEC_CTRL mask = 0x%016lx shadow = 0x%016lx\n", + vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW)); printk("*** Host State ***\n"); printk("RIP = 0x%016lx (%ps) RSP = 0x%016lx\n", @@ -2069,10 +2122,12 @@ vmr(HOST_PERF_GLOBAL_CTRL)); printk("*** Control State ***\n"); - printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + printk("PinBased=%08x CPUBased=%08x\n", vmr32(PIN_BASED_VM_EXEC_CONTROL), - vmr32(CPU_BASED_VM_EXEC_CONTROL), - vmr32(SECONDARY_VM_EXEC_CONTROL)); + vmr32(CPU_BASED_VM_EXEC_CONTROL)); + printk("SecondaryExec=%08x TertiaryExec=%016lx\n", + vmr32(SECONDARY_VM_EXEC_CONTROL), + vmr(TERTIARY_VM_EXEC_CONTROL)); printk("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmr32(EXCEPTION_BITMAP), @@ -2163,6 +2218,23 @@ if ( !ret ) register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1); + else + { + setup_clear_cpu_cap(X86_FEATURE_VMX); + + /* + * _vmx_vcpu_up() may have made it past feature identification. + * Make sure all dependent features are off as well. 
+ */ + vmx_basic_msr = 0; + vmx_pin_based_exec_control = 0; + vmx_cpu_based_exec_control = 0; + vmx_secondary_exec_control = 0; + vmx_vmexit_control = 0; + vmx_vmentry_control = 0; + vmx_ept_vpid_cap = 0; + vmx_vmfunc = 0; + } return ret; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/vmx.c xen-4.17.5/xen/arch/x86/hvm/vmx/vmx.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vmx/vmx.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/vmx/vmx.c 2024-08-14 09:03:57.000000000 +0000 @@ -58,6 +58,7 @@ #include #include #include +#include #include static bool_t __initdata opt_force_ept; @@ -725,6 +726,12 @@ v->arch.hvm.vmx.secondary_exec_control); } +void vmx_update_tertiary_exec_control(const struct vcpu *v) +{ + __vmwrite(TERTIARY_VM_EXEC_CONTROL, + v->arch.hvm.vmx.tertiary_exec_control); +} + void vmx_update_exception_bitmap(struct vcpu *v) { u32 bitmap = unlikely(v->arch.hvm.vmx.vmx_realmode) @@ -753,23 +760,44 @@ /* * We can safely pass MSR_SPEC_CTRL through to the guest, even if STIBP * isn't enumerated in hardware, as SPEC_CTRL_STIBP is ignored. + * + * If VMX_VIRT_SPEC_CTRL is available, it is activated by default and the + * guest MSR_SPEC_CTRL value lives in the VMCS. Otherwise, it lives in + * the MSR load/save list. */ if ( cp->feat.ibrsb ) { vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); - rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); - if ( rc ) - goto out; + if ( !cpu_has_vmx_virt_spec_ctrl ) + { + rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); + if ( rc ) + goto out; + } } else { vmx_set_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); - rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST); - if ( rc && rc != -ESRCH ) - goto out; - rc = 0; /* Tolerate -ESRCH */ + if ( !cpu_has_vmx_virt_spec_ctrl ) + vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST); + } + + if ( cpu_has_vmx_virt_spec_ctrl ) + { + /* + * If we're on BHI_DIS_S capable hardware, the short loop sequence is + * not sufficient to mitigate Native-BHI. If the VM can't see it + * (i.e. it's levelled with older hardware), force it behind the + * guest's back for safety. + * + * Because there's not a real Host/Guest split of the MSR_SPEC_CTRL + * value, this only works as expected when Xen is using BHI_DIS_S too. + */ + bool force_bhi_dis_s = opt_bhi_dis_s && !cp->feat.bhi_ctrl; + + __vmwrite(SPEC_CTRL_MASK, force_bhi_dis_s ? SPEC_CTRL_BHI_DIS_S : 0); } /* MSR_PRED_CMD is safe to pass through if the guest knows about it. */ @@ -2586,6 +2614,10 @@ switch ( reg ) { case MSR_SPEC_CTRL: + if ( cpu_has_vmx_virt_spec_ctrl ) + /* Guest value in VMCS - fetched below. */ + break; + rc = vmx_read_guest_msr(v, reg, &val); if ( rc ) { @@ -2606,6 +2638,11 @@ vmx_vmcs_enter(v); switch ( reg ) { + case MSR_SPEC_CTRL: + ASSERT(cpu_has_vmx_virt_spec_ctrl); + __vmread(SPEC_CTRL_SHADOW, &val); + break; + case MSR_IA32_BNDCFGS: __vmread(GUEST_BNDCFGS, &val); break; @@ -2630,6 +2667,10 @@ switch ( reg ) { case MSR_SPEC_CTRL: + if ( cpu_has_vmx_virt_spec_ctrl ) + /* Guest value in VMCS - set below.
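For the virtualized MSR_SPEC_CTRL handling above: the guest-visible value lives entirely in the VMCS SPEC_CTRL_SHADOW field, while SPEC_CTRL_MASK names the bits the guest may not change, which is what lets Xen keep BHI_DIS_S set behind a levelled guest's back. Reduced to its accessors (lines taken from the diff, shown out of context):

    uint64_t val;

    __vmread(SPEC_CTRL_SHADOW, &val);      /* guest/toolstack reads */
    __vmwrite(SPEC_CTRL_SHADOW, val);      /* guest/toolstack writes */

    /* Bits the guest cannot flip, forced on when hidden from the VM: */
    __vmwrite(SPEC_CTRL_MASK, force_bhi_dis_s ? SPEC_CTRL_BHI_DIS_S : 0);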
*/ + break; + rc = vmx_write_guest_msr(v, reg, val); if ( rc ) { @@ -2644,6 +2685,11 @@ vmx_vmcs_enter(v); switch ( reg ) { + case MSR_SPEC_CTRL: + ASSERT(cpu_has_vmx_virt_spec_ctrl); + __vmwrite(SPEC_CTRL_SHADOW, val); + break; + case MSR_IA32_BNDCFGS: __vmwrite(GUEST_BNDCFGS, val); break; @@ -3981,6 +4027,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) { unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0; + unsigned long cs_ar_bytes = 0; unsigned int vector = 0; struct vcpu *v = current; struct domain *currd = v->domain; @@ -3989,7 +4036,10 @@ __vmread(GUEST_RSP, ®s->rsp); __vmread(GUEST_RFLAGS, ®s->rflags); - hvm_invalidate_regs_fields(regs); + if ( hvm_long_mode_active(v) ) + __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); + + hvm_sanitize_regs_fields(regs, !(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE)); if ( paging_mode_hap(v->domain) ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vpt.c xen-4.17.5/xen/arch/x86/hvm/vpt.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hvm/vpt.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hvm/vpt.c 2024-08-14 09:03:57.000000000 +0000 @@ -161,7 +161,7 @@ * pt->vcpu field, because another thread holding the pt_migrate lock * may already be spinning waiting for your vcpu lock. */ -static void pt_vcpu_lock(struct vcpu *v) +static always_inline void pt_vcpu_lock(struct vcpu *v) { spin_lock(&v->arch.hvm.tm_lock); } @@ -180,9 +180,13 @@ * need to take an additional lock that protects against pt->vcpu * changing. */ -static void pt_lock(struct periodic_time *pt) +static always_inline void pt_lock(struct periodic_time *pt) { - read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + /* + * Use the speculation unsafe variant for the first lock, as the following + * lock taking helper already includes a speculation barrier. 
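The vpt.c comment above captures a pattern that recurs throughout this update's speculation fixes: when locks are taken back to back, only one acquisition needs to carry the serialising barrier, because that barrier also orders everything taken before it. In miniature (lock names illustrative):

    _read_lock(&outer);   /* speculation-unsafe variant: no extra barrier */
    spin_lock(&inner);    /* hardened variant: embeds the barrier         */
    /* One barrier suffices for both critical regions. */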
+ */ + _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); spin_lock(&pt->vcpu->arch.hvm.tm_lock); } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/hypercall.c xen-4.17.5/xen/arch/x86/hypercall.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/hypercall.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/hypercall.c 2024-08-14 09:03:57.000000000 +0000 @@ -152,8 +152,13 @@ cval = va_arg(args, unsigned int); if ( cval == nval ) mask &= ~1U; - else - BUG_ON(nval == (unsigned int)nval); + else if ( nval == (unsigned int)nval ) + { + printk(XENLOG_G_ERR + "multicall (op %lu) bogus continuation arg%u (%#lx)\n", + mcs->call.op, i, nval); + domain_crash(current->domain); + } } else if ( id && *id == i ) { @@ -165,8 +170,13 @@ mcs->call.args[i] = cval; ++rc; } - else - BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]); + else if ( mcs->call.args[i] != (unsigned int)mcs->call.args[i] ) + { + printk(XENLOG_G_ERR + "multicall (op %lu) bad continuation arg%u (%#lx)\n", + mcs->call.op, i, mcs->call.args[i]); + domain_crash(current->domain); + } } } else @@ -192,8 +202,13 @@ cval = va_arg(args, unsigned int); if ( cval == nval ) mask &= ~1U; - else - BUG_ON(nval == (unsigned int)nval); + else if ( nval == (unsigned int)nval ) + { + printk(XENLOG_G_ERR + "hypercall (op %u) bogus continuation arg%u (%#lx)\n", + regs->eax, i, nval); + domain_crash(current->domain); + } } else if ( id && *id == i ) { @@ -205,8 +220,13 @@ *reg = cval; ++rc; } - else - BUG_ON(*reg != (unsigned int)*reg); + else if ( *reg != (unsigned int)*reg ) + { + printk(XENLOG_G_ERR + "hypercall (op %u) bad continuation arg%u (%#lx)\n", + regs->eax, i, *reg); + domain_crash(current->domain); + } } } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/alternative.h xen-4.17.5/xen/arch/x86/include/asm/alternative.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/alternative.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/alternative.h 2024-08-14 09:03:57.000000000 +0000 @@ -167,9 +167,32 @@ #define ALT_CALL_arg5 "r8" #define ALT_CALL_arg6 "r9" +#ifdef CONFIG_CC_IS_CLANG +/* + * Clang doesn't follow the psABI and doesn't truncate parameter values at the + * callee. This can lead to bad code being generated when using alternative + * calls. + * + * Workaround it by using a temporary intermediate variable that's zeroed + * before being assigned the parameter value, as that forces clang to zero the + * register at the caller. 
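As a concrete illustration of the defect described above (hypothetical code, not Xen's): binding a narrow value straight to a fixed register leaves that register's upper bits unspecified, whereas routing it through a zeroed unsigned long pins them to zero, which is what the reworked macro arranges:

    /* Hypothetical illustration of the two bindings. */
    static void bind_narrow(unsigned char val)
    {
        register unsigned char a1_ asm ("dil") = val;

        /* Bits 8..63 of %rdi may hold stale data; a callee expecting a
         * zero-extended parameter can misbehave. */
        asm volatile ("" :: "r" (a1_));
    }

    static void bind_via_tmp(unsigned char val)
    {
        unsigned long tmp = 0;

        *(unsigned char *)&tmp = val;    /* low byte set, rest stays zero */

        /* The full register is now well defined. */
        asm volatile ("" :: "r" (tmp));
    }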
+ * + * This has been reported upstream: + * https://github.com/llvm/llvm-project/issues/12579 + * https://github.com/llvm/llvm-project/issues/82598 + */ +#define ALT_CALL_ARG(arg, n) \ + register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) = ({ \ + unsigned long tmp = 0; \ + BUILD_BUG_ON(sizeof(arg) > sizeof(unsigned long)); \ + *(typeof(arg) *)&tmp = (arg); \ + tmp; \ + }) +#else #define ALT_CALL_ARG(arg, n) \ register typeof(arg) a ## n ## _ asm ( ALT_CALL_arg ## n ) = \ ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) +#endif #define ALT_CALL_NO_ARG(n) \ register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) @@ -228,21 +251,24 @@ }) #define alternative_vcall1(func, arg) ({ \ - ALT_CALL_ARG(arg, 1); \ + typeof(arg) v1_ = (arg); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_NO_ARG2; \ (void)sizeof(func(arg)); \ (void)alternative_callN(1, int, func); \ }) #define alternative_call1(func, arg) ({ \ - ALT_CALL_ARG(arg, 1); \ + typeof(arg) v1_ = (arg); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_NO_ARG2; \ alternative_callN(1, typeof(func(arg)), func); \ }) #define alternative_vcall2(func, arg1, arg2) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_NO_ARG3; \ (void)sizeof(func(arg1, arg2)); \ @@ -250,17 +276,19 @@ }) #define alternative_call2(func, arg1, arg2) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_NO_ARG3; \ alternative_callN(2, typeof(func(arg1, arg2)), func); \ }) #define alternative_vcall3(func, arg1, arg2, arg3) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_NO_ARG4; \ @@ -269,9 +297,10 @@ }) #define alternative_call3(func, arg1, arg2, arg3) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_NO_ARG4; \ @@ -280,10 +309,11 @@ }) #define alternative_vcall4(func, arg1, arg2, arg3, arg4) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ @@ -293,10 +323,11 @@ }) #define alternative_call4(func, arg1, arg2, arg3, arg4) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ @@ -307,11 +338,12 @@ }) #define alternative_vcall5(func, arg1, arg2, arg3, arg4, arg5) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ typeof(arg5) v5_ = (arg5); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ @@ -322,11 +354,12 @@ }) #define alternative_call5(func, arg1, arg2, arg3, arg4, arg5) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ typeof(arg5) v5_ = (arg5); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ @@ 
-338,12 +371,13 @@ }) #define alternative_vcall6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ typeof(arg5) v5_ = (arg5); \ typeof(arg6) v6_ = (arg6); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ @@ -354,12 +388,13 @@ }) #define alternative_call6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ + typeof(arg1) v1_ = (arg1); \ typeof(arg2) v2_ = (arg2); \ typeof(arg3) v3_ = (arg3); \ typeof(arg4) v4_ = (arg4); \ typeof(arg5) v5_ = (arg5); \ typeof(arg6) v6_ = (arg6); \ - ALT_CALL_ARG(arg1, 1); \ + ALT_CALL_ARG(v1_, 1); \ ALT_CALL_ARG(v2_, 2); \ ALT_CALL_ARG(v3_, 3); \ ALT_CALL_ARG(v4_, 4); \ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/apic.h xen-4.17.5/xen/arch/x86/include/asm/apic.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/apic.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/apic.h 2024-08-14 09:03:57.000000000 +0000 @@ -145,6 +145,11 @@ (vector & 0x1f)) & 1; } +static inline bool apic_irr_read(unsigned int vector) +{ + return apic_read(APIC_IRR + (vector / 32 * 0x10)) & (1U << (vector % 32)); +} + static __inline u32 get_apic_id(void) /* Get the physical APIC id */ { u32 id = apic_read(APIC_ID); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/asm-defns.h xen-4.17.5/xen/arch/x86/include/asm/asm-defns.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/asm-defns.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/asm-defns.h 2024-08-14 09:03:57.000000000 +0000 @@ -20,10 +20,9 @@ .byte 0x0f, 0x01, 0xdd .endm -.macro INDIRECT_BRANCH insn:req arg:req +.macro INDIRECT_CALL arg:req /* - * Create an indirect branch. insn is one of call/jmp, arg is a single - * register. + * Create an indirect call. arg is a single register. * * With no compiler support, this degrades into a plain indirect call/jmp. * With compiler support, dispatch to the correct __x86_indirect_thunk_* @@ -33,7 +32,7 @@ $done = 0 .irp reg, ax, cx, dx, bx, bp, si, di, 8, 9, 10, 11, 12, 13, 14, 15 .ifeqs "\arg", "%r\reg" - \insn __x86_indirect_thunk_r\reg + call __x86_indirect_thunk_r\reg $done = 1 .exitm .endif @@ -44,19 +43,10 @@ .endif .else - \insn *\arg + call *\arg .endif .endm -/* Convenience wrappers. 
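Stepping back to the alternative.h changes above: the contract those alternative_call*/alternative_vcall* wrappers implement looks like this at a use site (the ops structure and hook are hypothetical; the argument-snapshot behaviour is the part the hunks above fix):

    struct ops {
        void (*flush)(unsigned int cpu, bool force);
    };
    extern struct ops ops;

    static void example(unsigned int cpu)
    {
        /*
         * Same effect as ops.flush(cpu, true), but emitted as a patchable
         * call site: each argument is snapshotted into a v*_ temporary and
         * bound to its fixed ALT_CALL_arg* register before the call.
         */
        alternative_vcall(ops.flush, cpu, true);
    }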
*/ -.macro INDIRECT_CALL arg:req - INDIRECT_BRANCH call \arg -.endm - -.macro INDIRECT_JMP arg:req - INDIRECT_BRANCH jmp \arg -.endm - #ifdef CONFIG_XEN_IBT # define ENDBR64 endbr64 #else diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/asm_defns.h xen-4.17.5/xen/arch/x86/include/asm/asm_defns.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/asm_defns.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/asm_defns.h 2024-08-14 09:03:57.000000000 +0000 @@ -81,6 +81,14 @@ #ifdef __ASSEMBLY__ +.macro BUILD_BUG_ON condstr, cond:vararg + .if \cond + .error "Condition \"\condstr\" not satisfied" + .endif +.endm +/* preprocessor macro to make error message more user friendly */ +#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond + #ifdef HAVE_AS_QUOTED_SYM #define SUBSECTION_LBL(tag) \ .ifndef .L.tag; \ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/cpufeature.h xen-4.17.5/xen/arch/x86/include/asm/cpufeature.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/cpufeature.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/cpufeature.h 2024-08-14 09:03:57.000000000 +0000 @@ -136,9 +136,11 @@ #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) +#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) +#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) #define cpu_has_avx512_fp16 boot_cpu_has(X86_FEATURE_AVX512_FP16) #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) @@ -160,6 +162,8 @@ #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) +#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) +#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) /* Synthesized. */ #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/cpufeatures.h xen-4.17.5/xen/arch/x86/include/asm/cpufeatures.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/cpufeatures.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/cpufeatures.h 2024-08-14 09:03:57.000000000 +0000 @@ -24,7 +24,7 @@ XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */ XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */ XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */ -/* Bit 12 unused. */ +XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ @@ -52,5 +52,13 @@ #define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ #define X86_BUG_IBPB_NO_RET X86_BUG( 3) /* IBPB doesn't flush the RSB/RAS */ +#define X86_SPEC_NO_LFENCE_ENTRY_PV X86_BUG(16) /* (No) safety LFENCE for SPEC_CTRL_ENTRY_PV. 
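These X86_SPEC_* additions (continuing just below) reuse the bug-word numbering so that ALTERNATIVE blocks, as seen in the spec_ctrl_asm.h hunks later in this diff, can key off them like any ordinary feature bit. Layout sketch, assuming the encoding used elsewhere in this header (word counts abbreviated):

    /* Bug/spec bits occupy an extra capability word past the synth words. */
    #define X86_BUG(x)  ((FSCAPINTS + X86_NR_SYNTH) * 32 + (x))

    /* e.g. X86_SPEC_BHB_LOOPS == X86_BUG(20): bit 20 of that final word. */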
*/ +#define X86_SPEC_NO_LFENCE_ENTRY_INTR X86_BUG(17) /* (No) safety LFENCE for SPEC_CTRL_ENTRY_INTR. */ +#define X86_SPEC_NO_LFENCE_ENTRY_VMX X86_BUG(18) /* (No) safety LFENCE for SPEC_CTRL_ENTRY_VMX. */ + +#define X86_SPEC_BHB_TSX X86_BUG(19) /* Use clear_bhb_tsx for BHI mitigation. */ +#define X86_SPEC_BHB_LOOPS X86_BUG(20) /* Use clear_bhb_loops for BHI mitigation.*/ +#define X86_SPEC_BHB_LOOPS_LONG X86_BUG(21) /* Upgrade clear_bhb_loops to the "long" sequence. */ + /* Total number of capability words, inc synth and bug words. */ #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/current.h xen-4.17.5/xen/arch/x86/include/asm/current.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/current.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/current.h 2024-08-14 09:03:57.000000000 +0000 @@ -55,9 +55,9 @@ /* See asm/spec_ctrl_asm.h for usage. */ unsigned int shadow_spec_ctrl; - uint8_t xen_spec_ctrl; - uint8_t last_spec_ctrl; - uint8_t spec_ctrl_flags; + unsigned int xen_spec_ctrl; + unsigned int last_spec_ctrl; + uint8_t scf; /* SCF_* */ /* * The following field controls copying of the L4 page table of 64-bit @@ -196,10 +196,10 @@ switch_stack_and_jump(fn, "jmp %c", "i") /* The constraint may only specify non-call-clobbered registers. */ -#define reset_stack_and_jump_ind(fn) \ +#define reset_stack_and_call_ind(fn) \ ({ \ (void)((fn) == (void (*)(void))NULL); \ - switch_stack_and_jump(fn, "INDIRECT_JMP %", "b"); \ + switch_stack_and_jump(fn, "INDIRECT_CALL %", "b"); \ }) /* diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/domain.h xen-4.17.5/xen/arch/x86/include/asm/domain.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/domain.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/domain.h 2024-08-14 09:03:57.000000000 +0000 @@ -324,7 +324,7 @@ uint32_t pci_cf8; uint8_t cmos_idx; - uint8_t spec_ctrl_flags; /* See SCF_DOM_MASK */ + uint8_t scf; /* See SCF_DOM_MASK */ union { struct pv_domain pv; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hpet.h xen-4.17.5/xen/arch/x86/include/asm/hpet.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hpet.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/hpet.h 2024-08-14 09:03:57.000000000 +0000 @@ -89,8 +89,8 @@ * Temporarily use an HPET event counter for timer interrupt handling, * rather than using the LAPIC timer. Used for Cx state entry. */ -void cf_check hpet_broadcast_init(void); -void cf_check hpet_broadcast_resume(void); +void hpet_broadcast_init(void); +void hpet_broadcast_resume(void); void cf_check hpet_broadcast_enter(void); void cf_check hpet_broadcast_exit(void); int hpet_broadcast_is_available(void); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/hvm.h xen-4.17.5/xen/arch/x86/include/asm/hvm/hvm.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/hvm.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/hvm/hvm.h 2024-08-14 09:03:57.000000000 +0000 @@ -583,8 +583,24 @@ ? alternative_call(hvm_funcs.get_insn_bytes, v, buf) : 0); } -static inline void hvm_invalidate_regs_fields(struct cpu_user_regs *regs) +static inline void hvm_sanitize_regs_fields(struct cpu_user_regs *regs, + bool compat) { + if ( compat ) + { + /* Clear GPR upper halves, to counteract guests playing games. 
*/ + regs->rbp = (uint32_t)regs->rbp; + regs->rbx = (uint32_t)regs->rbx; + regs->rax = (uint32_t)regs->rax; + regs->rcx = (uint32_t)regs->rcx; + regs->rdx = (uint32_t)regs->rdx; + regs->rsi = (uint32_t)regs->rsi; + regs->rdi = (uint32_t)regs->rdi; + regs->rip = (uint32_t)regs->rip; + regs->rflags = (uint32_t)regs->rflags; + regs->rsp = (uint32_t)regs->rsp; + } + #ifndef NDEBUG regs->error_code = 0xbeef; regs->entry_vector = 0xbeef; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/vmx/vmcs.h xen-4.17.5/xen/arch/x86/include/asm/hvm/vmx/vmcs.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/vmx/vmcs.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/hvm/vmx/vmcs.h 2024-08-14 09:03:57.000000000 +0000 @@ -125,6 +125,7 @@ /* Cache of cpu execution control. */ u32 exec_control; u32 secondary_exec_control; + uint64_t tertiary_exec_control; u32 exception_bitmap; uint64_t shadow_gs; @@ -207,6 +208,7 @@ #define CPU_BASED_RDTSC_EXITING 0x00001000 #define CPU_BASED_CR3_LOAD_EXITING 0x00008000 #define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS 0x00020000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -271,6 +273,17 @@ #define SECONDARY_EXEC_NOTIFY_VM_EXITING 0x80000000 extern u32 vmx_secondary_exec_control; +#define TERTIARY_EXEC_LOADIWKEY_EXITING BIT(0, UL) +#define TERTIARY_EXEC_ENABLE_HLAT BIT(1, UL) +#define TERTIARY_EXEC_EPT_PAGING_WRITE BIT(2, UL) +#define TERTIARY_EXEC_GUEST_PAGING_VERIFY BIT(3, UL) +#define TERTIARY_EXEC_IPI_VIRT BIT(4, UL) +#define TERTIARY_EXEC_VIRT_SPEC_CTRL BIT(7, UL) +extern uint64_t vmx_tertiary_exec_control; + +#define cpu_has_vmx_virt_spec_ctrl \ + (vmx_tertiary_exec_control & TERTIARY_EXEC_VIRT_SPEC_CTRL) + #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x00000001 #define VMX_EPT_WALK_LENGTH_4_SUPPORTED 0x00000040 #define VMX_EPT_MEMORY_TYPE_UC 0x00000100 @@ -307,6 +320,8 @@ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP) #define cpu_has_vmx_secondary_exec_control \ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) +#define cpu_has_vmx_tertiary_exec_control \ + (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define cpu_has_vmx_ept \ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) #define cpu_has_vmx_dt_exiting \ @@ -430,6 +445,9 @@ VIRT_EXCEPTION_INFO = 0x0000202a, XSS_EXIT_BITMAP = 0x0000202c, TSC_MULTIPLIER = 0x00002032, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + SPEC_CTRL_MASK = 0x0000204a, + SPEC_CTRL_SHADOW = 0x0000204c, GUEST_PHYSICAL_ADDRESS = 0x00002400, VMCS_LINK_POINTER = 0x00002800, GUEST_IA32_DEBUGCTL = 0x00002802, diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/vmx/vmx.h xen-4.17.5/xen/arch/x86/include/asm/hvm/vmx/vmx.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/hvm/vmx/vmx.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/hvm/vmx/vmx.h 2024-08-14 09:03:57.000000000 +0000 @@ -102,6 +102,7 @@ void vmx_update_exception_bitmap(struct vcpu *v); void vmx_update_cpu_exec_control(struct vcpu *v); void vmx_update_secondary_exec_control(struct vcpu *v); +void vmx_update_tertiary_exec_control(const struct vcpu *v); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/intel-family.h xen-4.17.5/xen/arch/x86/include/asm/intel-family.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/intel-family.h 2024-02-02 
07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/intel-family.h 2024-08-14 09:03:57.000000000 +0000 @@ -26,6 +26,9 @@ * _G - parts with extra graphics on * _X - regular server parts * _D - micro server parts + * _N,_P - other mobile parts + * _H - premium mobile parts + * _S - other client parts * * Historical OPTDIFFs: * @@ -37,6 +40,9 @@ * their own names :-( */ +/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ +#define INTEL_FAM6_ANY X86_MODEL_ANY + #define INTEL_FAM6_CORE_YONAH 0x0E #define INTEL_FAM6_CORE2_MEROM 0x0F @@ -93,8 +99,6 @@ #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ - #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ @@ -102,12 +106,31 @@ #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF + +#define INTEL_FAM6_GRANITERAPIDS_X 0xAD +#define INTEL_FAM6_GRANITERAPIDS_D 0xAE + +/* "Hybrid" Processors (P-Core/E-Core) */ + +#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ + #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ -#define INTEL_FAM6_RAPTORLAKE 0xB7 +#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ +#define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_FAM6_RAPTORLAKE_S 0xBF + +#define INTEL_FAM6_METEORLAKE 0xAC +#define INTEL_FAM6_METEORLAKE_L 0xAA + +#define INTEL_FAM6_ARROWLAKE_H 0xC5 +#define INTEL_FAM6_ARROWLAKE 0xC6 + +#define INTEL_FAM6_LUNARLAKE_M 0xBD -/* "Small Core" Processors (Atom) */ +/* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ @@ -134,6 +157,13 @@ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ + +#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ +#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ + +#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ + /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/io.h xen-4.17.5/xen/arch/x86/include/asm/io.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/io.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/io.h 2024-08-14 09:03:57.000000000 +0000 @@ -47,10 +47,14 @@ __OUT(w,"w",short) __OUT(l,,int) -/* Function pointer used to handle platform specific I/O port emulation. */ +/* + * Boolean indicator and function used to handle platform specific I/O port + * emulation. 
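The io.h hunk here (its comment closes just below) pairs with the ioport_emulate.c change further down: the quirk hook stops being a writable function pointer and becomes a boolean plus an ordinary external function. The payoff is at the call site, which presumably ends up shaped like this (the emul-priv-op.c hunk is not part of this excerpt, and 'len' is illustrative):

    if ( ioemul_handle_quirk )
        /* Direct call, where previously this dispatched through a pointer. */
        len = ioemul_handle_proliant_quirk(opcode, io_emul_stub, regs);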
+ */ #define IOEMUL_QUIRK_STUB_BYTES 9 +extern bool ioemul_handle_quirk; struct cpu_user_regs; -extern unsigned int (*ioemul_handle_quirk)( - u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); +unsigned int ioemul_handle_proliant_quirk( + uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs); #endif diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/irq.h xen-4.17.5/xen/arch/x86/include/asm/irq.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/irq.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/irq.h 2024-08-14 09:03:57.000000000 +0000 @@ -70,27 +70,6 @@ extern int opt_irq_vector_map; -/* - * Per-cpu current frame pointer - the location of the last exception frame on - * the stack - */ -DECLARE_PER_CPU(struct cpu_user_regs *, __irq_regs); - -static inline struct cpu_user_regs *get_irq_regs(void) -{ - return this_cpu(__irq_regs); -} - -static inline struct cpu_user_regs *set_irq_regs(struct cpu_user_regs *new_regs) -{ - struct cpu_user_regs *old_regs, **pp_regs = &this_cpu(__irq_regs); - - old_regs = *pp_regs; - *pp_regs = new_regs; - return old_regs; -} - - #define platform_legacy_irq(irq) ((irq) < 16) void cf_check event_check_interrupt(struct cpu_user_regs *regs); @@ -157,7 +136,7 @@ int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq); int unmap_domain_pirq_emuirq(struct domain *d, int pirq); -/* Reset irq affinities to match the given CPU mask. */ +/* Evacuate interrupts assigned to CPUs not present in the input CPU mask. */ void fixup_irqs(const cpumask_t *mask, bool verbose); void fixup_eoi(void); @@ -178,6 +157,7 @@ extern struct irq_desc *irq_desc; +/* Not speculation safe, only used for AP bringup. */ void lock_vector_lock(void); void unlock_vector_lock(void); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/mach-generic/mach_apic.h xen-4.17.5/xen/arch/x86/include/asm/mach-generic/mach_apic.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/mach-generic/mach_apic.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/mach-generic/mach_apic.h 2024-08-14 09:03:57.000000000 +0000 @@ -13,7 +13,7 @@ #define INT_DELIVERY_MODE (genapic.int_delivery_mode) #define INT_DEST_MODE (genapic.int_dest_mode) #define TARGET_CPUS ((const typeof(cpu_online_map) *)&cpu_online_map) -#define init_apic_ldr (genapic.init_apic_ldr) +#define init_apic_ldr() alternative_vcall(genapic.init_apic_ldr) #define cpu_mask_to_apicid(mask) ({ \ /* \ * There are a number of places where the address of a local variable \ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/mm.h xen-4.17.5/xen/arch/x86/include/asm/mm.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/mm.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/mm.h 2024-08-14 09:03:57.000000000 +0000 @@ -393,7 +393,9 @@ * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is * only supported for hvm guests, which do not have PV PTEs updated. 
*/ -int page_lock(struct page_info *page); +int page_lock_unsafe(struct page_info *page); +#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg)) + void page_unlock(struct page_info *page); void put_page_type(struct page_info *page); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/msr-index.h xen-4.17.5/xen/arch/x86/include/asm/msr-index.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/msr-index.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/msr-index.h 2024-08-14 09:03:57.000000000 +0000 @@ -88,6 +88,8 @@ #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) +#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) +#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) #define MSR_FLUSH_CMD 0x0000010b #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) @@ -325,6 +327,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x48f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x490 #define MSR_IA32_VMX_VMFUNC 0x491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x492 /* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */ diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/msr.h xen-4.17.5/xen/arch/x86/include/asm/msr.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/msr.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/msr.h 2024-08-14 09:03:57.000000000 +0000 @@ -290,8 +290,11 @@ * For PV guests, this holds the guest kernel value. It is accessed on * every entry/exit path. * - * For VT-x guests, the guest value is held in the MSR guest load/save - * list. + * For VT-x guests, one of two situations exist: + * + * - If hardware supports virtualized MSR_SPEC_CTRL, it is active by + * default and the guest value lives in the VMCS. + * - Otherwise, the guest value is held in the MSR load/save list. * * For SVM, the guest value lives in the VMCB, and hardware saves/restores * the host value automatically. 
However, guests run with the OR of the diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/nospec.h xen-4.17.5/xen/arch/x86/include/asm/nospec.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/nospec.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/nospec.h 2024-08-14 09:03:57.000000000 +0000 @@ -38,6 +38,32 @@ barrier_nospec_true(); } +static always_inline void arch_block_lock_speculation(void) +{ + alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); +} + +/* Allow to insert a read memory barrier into conditionals */ +static always_inline bool barrier_lock_true(void) +{ + alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); + return true; +} + +static always_inline bool barrier_lock_false(void) +{ + alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); + return false; +} + +static always_inline bool arch_lock_evaluate_nospec(bool condition) +{ + if ( condition ) + return barrier_lock_true(); + else + return barrier_lock_false(); +} + #endif /* _ASM_X86_NOSPEC_H */ /* diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/paging.h xen-4.17.5/xen/arch/x86/include/asm/paging.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/paging.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/paging.h 2024-08-14 09:03:57.000000000 +0000 @@ -138,8 +138,7 @@ paddr_t ga, uint32_t *pfec, unsigned int *page_order); #endif - pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, - bool noflush); + pagetable_t (*update_cr3 )(struct vcpu *v, bool noflush); void (*update_paging_modes )(struct vcpu *v); bool (*flush_tlb )(const unsigned long *vcpu_bitmap); @@ -312,7 +311,7 @@ * as the value to load into the host CR3 to schedule this vcpu */ static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) { - return paging_get_hostmode(v)->update_cr3(v, 1, noflush); + return paging_get_hostmode(v)->update_cr3(v, noflush); } /* Update all the things that are derived from the guest's CR0/CR3/CR4. diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/setup.h xen-4.17.5/xen/arch/x86/include/asm/setup.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/setup.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/setup.h 2024-08-14 09:03:57.000000000 +0000 @@ -48,6 +48,8 @@ void microcode_grab_module( unsigned long *, const multiboot_info_t *); +int cf_check stub_selftest(void); + extern uint8_t kbd_shift_flags; #ifdef NDEBUG diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/spec_ctrl.h xen-4.17.5/xen/arch/x86/include/asm/spec_ctrl.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/spec_ctrl.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/spec_ctrl.h 2024-08-14 09:03:57.000000000 +0000 @@ -21,10 +21,10 @@ #define __X86_SPEC_CTRL_H__ /* - * Encoding of: - * cpuinfo.spec_ctrl_flags - * default_spec_ctrl_flags - * domain.spec_ctrl_flags + * Encoding of Xen's speculation control flags in: + * cpuinfo.scf + * default_scf + * domain.scf * * Live settings are in the top-of-stack block, because they need to be * accessable when XPTI is active. Some settings are fixed from boot, some @@ -36,6 +36,7 @@ #define SCF_verw (1 << 3) #define SCF_ist_ibpb (1 << 4) #define SCF_entry_ibpb (1 << 5) +#define SCF_entry_bhb (1 << 6) /* * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some @@ -54,7 +55,7 @@ * Some speculative protections are per-domain. 
These settings are merged * into the top-of-stack block in the context switch path. */ -#define SCF_DOM_MASK (SCF_verw | SCF_entry_ibpb) +#define SCF_DOM_MASK (SCF_verw | SCF_entry_ibpb | SCF_entry_bhb) #ifndef __ASSEMBLY__ @@ -89,12 +90,13 @@ extern int8_t opt_ibpb_ctxt_switch; extern bool opt_ssbd; +extern int8_t opt_bhi_dis_s; extern int8_t opt_eager_fpu; extern int8_t opt_l1d_flush; extern bool bsp_delay_spec_ctrl; -extern uint8_t default_xen_spec_ctrl; -extern uint8_t default_spec_ctrl_flags; +extern unsigned int default_xen_spec_ctrl; +extern uint8_t default_scf; extern int8_t opt_xpti_hwdom, opt_xpti_domu; @@ -114,7 +116,7 @@ info->shadow_spec_ctrl = 0; info->xen_spec_ctrl = default_xen_spec_ctrl; - info->spec_ctrl_flags = default_spec_ctrl_flags; + info->scf = default_scf; /* * For least latency, the VERW selector should be a writeable data @@ -138,7 +140,7 @@ */ info->shadow_spec_ctrl = val; barrier(); - info->spec_ctrl_flags |= SCF_use_shadow; + info->scf |= SCF_use_shadow; barrier(); alternative_input("", "wrmsr", X86_FEATURE_SC_MSR_IDLE, "a" (val), "c" (MSR_SPEC_CTRL), "d" (0)); @@ -187,7 +189,7 @@ * Disable shadowing before updating the MSR. There are no SMP issues * here; only local processor ordering concerns. */ - info->spec_ctrl_flags &= ~SCF_use_shadow; + info->scf &= ~SCF_use_shadow; barrier(); alternative_input("", "wrmsr", X86_FEATURE_SC_MSR_IDLE, "a" (val), "c" (MSR_SPEC_CTRL), "d" (0)); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/spec_ctrl_asm.h xen-4.17.5/xen/arch/x86/include/asm/spec_ctrl_asm.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/spec_ctrl_asm.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/spec_ctrl_asm.h 2024-08-14 09:03:57.000000000 +0000 @@ -51,7 +51,7 @@ * shadowing logic. * * Factor 2 is harder. We maintain a shadow_spec_ctrl value, and a use_shadow - * boolean in the per cpu spec_ctrl_flags. The synchronous use is: + * boolean in the per cpu scf. The synchronous use is: * * 1) Store guest value in shadow_spec_ctrl * 2) Set the use_shadow boolean @@ -87,33 +87,21 @@ * - SPEC_CTRL_EXIT_TO_{SVM,VMX} */ -.macro DO_SPEC_CTRL_COND_IBPB maybexen:req +.macro DO_COND_IBPB /* - * Requires %rsp=regs (also cpuinfo if !maybexen) - * Requires %r14=stack_end (if maybexen), %rdx=0 - * Clobbers %rax, %rcx, %rdx + * Requires %rbx=SCF, %rdx=0 + * Clobbers %rax, %rcx * - * Conditionally issue IBPB if SCF_entry_ibpb is active. In the maybexen - * case, we can safely look at UREGS_cs to skip taking the hit when - * interrupting Xen. + * Conditionally issue IBPB if SCF_entry_ibpb is active. */ - .if \maybexen - testb $SCF_entry_ibpb, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - jz .L\@_skip - testb $3, UREGS_cs(%rsp) - .else - testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) - .endif + testb $SCF_entry_ibpb, %bl jz .L\@_skip mov $MSR_PRED_CMD, %ecx mov $PRED_CMD_IBPB, %eax wrmsr - jmp .L\@_done .L\@_skip: - lfence -.L\@_done: .endm .macro DO_OVERWRITE_RSB tmp=rax xu @@ -164,16 +152,30 @@ #endif .endm -.macro DO_SPEC_CTRL_COND_VERW /* - * Requires %rsp=cpuinfo + * Helper to improve the readibility of stack dispacements with %rsp in + * unusual positions. Both @field and @top_of_stack should be constants from + * the same object. @top_of_stack should be where %rsp is currently pointing. 
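With SCF_entry_bhb added to SCF_DOM_MASK above, the per-domain BHB-clearing choice is merged into the top-of-stack block alongside SCF_verw and SCF_entry_ibpb. The merge that comment describes presumably reduces to a line of this shape on the context-switch path (the info/nextd names here are assumptions):

    info->scf = (info->scf & ~SCF_DOM_MASK) | (nextd->arch.scf & SCF_DOM_MASK);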
+ */ +#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) + +.macro SPEC_CTRL_COND_VERW \ + scf=STK_REL(CPUINFO_scf, CPUINFO_error_code), \ + sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) +/* + * Requires \scf and \sel as %rsp-relative expressions + * Clobbers eflags + * + * VERW needs to run after guest GPRs have been restored, where only %rsp is + * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. + * Contexts where this is not true must provide an alternative \scf and \sel. * * Issue a VERW for its flushing side effect, if indicated. This is a Spectre * v1 gadget, but the IRET/VMEntry is serialising. */ - testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) + testb $SCF_verw, \scf(%rsp) jz .L\@_verw_skip - verw CPUINFO_verw_sel(%rsp) + verw \sel(%rsp) .L\@_verw_skip: .endm @@ -214,11 +216,11 @@ testb $3, UREGS_cs(%rsp) setnz %al not %eax - and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax + and %al, STACK_CPUINFO_FIELD(scf)(%r14) + mov STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax .else - andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp) - movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax + andb $~SCF_use_shadow, CPUINFO_scf(%rsp) + mov CPUINFO_xen_spec_ctrl(%rsp), %eax .endif wrmsr @@ -236,7 +238,7 @@ mov %eax, CPUINFO_shadow_spec_ctrl(%rsp) /* Set SPEC_CTRL shadowing *before* loading the guest value. */ - orb $SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp) + orb $SCF_use_shadow, CPUINFO_scf(%rsp) mov $MSR_SPEC_CTRL, %ecx xor %edx, %edx @@ -249,35 +251,97 @@ */ .macro SPEC_CTRL_ENTRY_FROM_PV /* - * Requires %rsp=regs/cpuinfo, %rdx=0 - * Clobbers %rax, %rcx, %rdx + * Requires %rsp=regs/cpuinfo, %r14=stack_end, %rdx=0 + * Clobbers %rax, %rbx, %rcx, %rdx */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ - X86_FEATURE_IBPB_ENTRY_PV + movzbl STACK_CPUINFO_FIELD(scf)(%r14), %ebx + + /* + * For all safety notes, 32bit PV guest kernels run in Ring 1 and are + * therefore supervisor (== Xen) in the architecture. As a result, most + * hardware isolation techniques do not work. + */ + + /* + * IBPB is to mitigate BTC/SRSO on AMD/Hygon parts, in particular making + * type-confused RETs safe to use. This is not needed on Zen5 and later + * parts when SRSO_U/S_NO is enumerated. + */ + ALTERNATIVE "", DO_COND_IBPB, X86_FEATURE_IBPB_ENTRY_PV + /* + * RSB stuffing is to prevent RET predictions following guest entries. + * This is not needed if SMEP is active and the RSB is full-width. + */ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV + /* + * Only used on Intel parts. Restore Xen's MSR_SPEC_CTRL setting. The + * guest can't change it's value behind Xen's back. For Legacy IBRS, this + * flushes/inhibits indirect predictions and does not flush the RSB. For + * eIBRS, this prevents CALLs/JMPs using predictions learnt at a lower + * predictor mode, and it flushes the RSB. + */ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ X86_FEATURE_SC_MSR_PV + + /* + * Clear the BHB to mitigate BHI. Used on eIBRS parts, and uses RETs + * itself so must be after we've perfomed all the RET-safety we can. + */ + testb $SCF_entry_bhb, %bl + jz .L\@_skip_bhb + ALTERNATIVE_2 "", \ + "call clear_bhb_loops", X86_SPEC_BHB_LOOPS, \ + "call clear_bhb_tsx", X86_SPEC_BHB_TSX +.L\@_skip_bhb: + + ALTERNATIVE "lfence", "", X86_SPEC_NO_LFENCE_ENTRY_PV .endm /* * Used after an exception or maskable interrupt, hitting Xen or PV context. 
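A worked instance of STK_REL, with illustrative offsets: if CPUINFO_error_code were 0x58 within struct cpu_info and CPUINFO_verw_sel were 0xd0, then with %rsp parked at error_code:

    STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) = 0xd0 - 0x58 = 0x78

so the default `verw \sel(%rsp)` above resolves to verw 0x78(%rsp), reaching verw_sel without needing %r14 or a cpuinfo-relative %rsp.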
- * There will either be a guest speculation context, or (barring fatal - * exceptions) a well-formed Xen speculation context. + * There will either be a guest speculation context, or a well-formed Xen + * speculation context, with the exception of one case. IRET #GP handling may + * have a guest choice of MSR_SPEC_CTRL. + * + * Therefore, we can skip the flush/barrier-like protections when hitting Xen, + * but we must still run the mode-based protections. */ .macro SPEC_CTRL_ENTRY_FROM_INTR /* * Requires %rsp=regs, %r14=stack_end, %rdx=0 - * Clobbers %rax, %rcx, %rdx + * Clobbers %rax, %rbx, %rcx, %rdx */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ - X86_FEATURE_IBPB_ENTRY_PV + movzbl STACK_CPUINFO_FIELD(scf)(%r14), %ebx + + /* + * All safety notes the same as SPEC_CTRL_ENTRY_FROM_PV, although there is + * a conditional jump skipping some actions when interrupting Xen. + * + * On Intel parts, the IRET #GP path ends up here with the guest's choice + * of MSR_SPEC_CTRL. + */ + + testb $3, UREGS_cs(%rsp) + jz .L\@_skip + + ALTERNATIVE "", DO_COND_IBPB, X86_FEATURE_IBPB_ENTRY_PV ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV +.L\@_skip: ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ X86_FEATURE_SC_MSR_PV + + testb $SCF_entry_bhb, %bl + jz .L\@_skip_bhb + ALTERNATIVE_2 "", \ + "call clear_bhb_loops", X86_SPEC_BHB_LOOPS, \ + "call clear_bhb_tsx", X86_SPEC_BHB_TSX +.L\@_skip_bhb: + + ALTERNATIVE "lfence", "", X86_SPEC_NO_LFENCE_ENTRY_INTR .endm /* @@ -291,8 +355,6 @@ */ ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV - DO_SPEC_CTRL_COND_VERW - ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV .endm @@ -311,13 +373,26 @@ * Clobbers %rax, %rbx, %rcx, %rdx * * This is logical merge of: - * DO_SPEC_CTRL_COND_IBPB maybexen=0 + * DO_COND_IBPB * DO_OVERWRITE_RSB * DO_SPEC_CTRL_ENTRY maybexen=1 * but with conditionals rather than alternatives. */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx + movzbl STACK_CPUINFO_FIELD(scf)(%r14), %ebx + + /* + * For all safety notes, 32bit PV guest kernels run in Ring 1 and are + * therefore supervisor (== Xen) in the architecture. As a result, most + * hardware isolation techniques do not work. + */ + /* + * IBPB is to mitigate BTC/SRSO on AMD/Hygon parts, in particular making + * type-confused RETs safe to use. This is not needed on Zen5 and later + * parts when SRSO_U/S_NO is enumerated. The SVM path takes care of + * Host/Guest interactions prior to clearing GIF, and it's not used on the + * VMX path. + */ test $SCF_ist_ibpb, %bl jz .L\@_skip_ibpb @@ -327,6 +402,12 @@ .L\@_skip_ibpb: + /* + * RSB stuffing is to prevent RET predictions following guest entries. + * SCF_ist_rsb is active if either PV or HVM protections are needed. The + * VMX path cannot guarantee to make the RSB safe ahead of taking an IST + * vector. + */ test $SCF_ist_rsb, %bl jz .L\@_skip_rsb @@ -334,6 +415,16 @@ .L\@_skip_rsb: + /* + * Only used on Intel parts. Restore Xen's MSR_SPEC_CTRL setting. PV + * guests can't change their value behind Xen's back. HVM guests have + * their value stored in the MSR load/save list. For Legacy IBRS, this + * flushes/inhibits indirect predictions and does not flush the RSB. For + * eIBRS, this prevents CALLs/JMPs using predictions learnt at a lower + * predictor mode, and it flushes the RSB. On eIBRS parts that also + * suffer from PBRSB, the prior RSB stuffing suffices to make the RSB + * safe. 
+ */ test $SCF_ist_sc_msr, %bl jz .L\@_skip_msr_spec_ctrl @@ -341,25 +432,28 @@ testb $3, UREGS_cs(%rsp) setnz %al not %eax - and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + and %al, STACK_CPUINFO_FIELD(scf)(%r14) /* Load Xen's intended value. */ mov $MSR_SPEC_CTRL, %ecx - movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax + mov STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax wrmsr - /* Opencoded UNLIKELY_START() with no condition. */ -UNLIKELY_DISPATCH_LABEL(\@_serialise): - .subsection 1 - /* - * In the case that we might need to set SPEC_CTRL.IBRS for safety, we - * need to ensure that an attacker can't poison the `jz .L\@_skip_wrmsr` - * to speculate around the WRMSR. As a result, we need a dispatch - * serialising instruction in the else clause. - */ .L\@_skip_msr_spec_ctrl: + + /* + * Clear the BHB to mitigate BHI. Used on eIBRS parts, and uses RETs + * itself so must be after we've perfomed all the RET-safety we can. + */ + testb $SCF_entry_bhb, %bl + jz .L\@_skip_bhb + + ALTERNATIVE_2 "", \ + "call clear_bhb_loops", X86_SPEC_BHB_LOOPS, \ + "call clear_bhb_tsx", X86_SPEC_BHB_TSX +.L\@_skip_bhb: + lfence - UNLIKELY_END(\@_serialise) .endm /* @@ -372,10 +466,10 @@ */ .macro SPEC_CTRL_EXIT_TO_XEN /* - * Requires %r12=ist_exit, %r14=stack_end + * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs * Clobbers %rax, %rbx, %rcx, %rdx */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx + movzbl STACK_CPUINFO_FIELD(scf)(%r14), %ebx testb $SCF_ist_sc_msr, %bl jz .L\@_skip_sc_msr @@ -400,11 +494,18 @@ test %r12, %r12 jz .L\@_skip_ist_exit - /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ - testb $SCF_verw, %bl - jz .L\@_skip_verw - verw STACK_CPUINFO_FIELD(verw_sel)(%r14) -.L\@_skip_verw: + /* + * Stash SCF and verw_sel above eflags in the case of an IST_exit. The + * VERW logic needs to run after guest GPRs have been restored; i.e. where + * we cannot use %r12 or %r14 for the purposes they have here. + * + * When the CPU pushed this exception frame, it zero-extended eflags. + * Therefore it is safe for the VERW logic to look at the stashed SCF + * outside of the ist_exit condition. Also, this stashing won't influence + * any other restore_all_guest() paths. 
+ */ + or $(__HYPERVISOR_DS32 << 16), %ebx + mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/uaccess.h xen-4.17.5/xen/arch/x86/include/asm/uaccess.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/include/asm/uaccess.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/include/asm/uaccess.h 2024-08-14 09:03:57.000000000 +0000 @@ -421,7 +421,8 @@ unsigned long raw; }; -extern unsigned long search_exception_table(const struct cpu_user_regs *regs); +extern unsigned long search_exception_table(const struct cpu_user_regs *regs, + unsigned long *stub_ra); extern void sort_exception_tables(void); extern void sort_exception_table(struct exception_table_entry *start, const struct exception_table_entry *stop); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/io_apic.c xen-4.17.5/xen/arch/x86/io_apic.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/io_apic.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/io_apic.c 2024-08-14 09:03:57.000000000 +0000 @@ -1692,7 +1692,8 @@ !io_apic_level_ack_pending(desc->irq)) move_masked_irq(desc); - if ( !(v & (1 << (i & 0x1f))) ) { + if ( !(v & (1U << (i & 0x1f))) ) + { spin_lock(&ioapic_lock); __edge_IO_APIC_irq(desc->irq); __level_IO_APIC_irq(desc->irq); @@ -1756,7 +1757,8 @@ !io_apic_level_ack_pending(desc->irq) ) move_native_irq(desc); - if (!(v & (1 << (i & 0x1f)))) { + if ( !(v & (1U << (i & 0x1f))) ) + { spin_lock(&ioapic_lock); __mask_IO_APIC_irq(desc->irq); __edge_IO_APIC_irq(desc->irq); @@ -2659,18 +2661,21 @@ nr_irqs_gsi, nr_irqs - nr_irqs_gsi); } -unsigned int arch_hwdom_irqs(domid_t domid) +unsigned int __hwdom_init arch_hwdom_irqs(const struct domain *d) { unsigned int n = fls(num_present_cpus()); + /* Bounding by the domain pirq EOI bitmap capacity. */ + const unsigned int max_irqs = min_t(unsigned int, nr_irqs, + PAGE_SIZE * BITS_PER_BYTE); - if ( !domid ) - n = min(n, dom0_max_vcpus()); - n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs); + if ( is_system_domain(d) ) + return max_irqs; - /* Bounded by the domain pirq eoi bitmap gfn. 
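The new arch_hwdom_irqs() bound (this hunk continues just below) is easy to check numerically, assuming 4K pages: the PIRQ EOI bitmap occupies a single page, so

    max_irqs <= PAGE_SIZE * BITS_PER_BYTE = 4096 * 8 = 32768

and for dom0 on a host with 8 present CPUs and dom0_max_vcpus() == 8:

    n = min(fls(8), 8) = 4
    n = min(nr_irqs_gsi + 4 * NR_DYNAMIC_VECTORS, max_irqs)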
*/ - n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE); + if ( !d->domain_id ) + n = min(n, dom0_max_vcpus()); + n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, max_irqs); - printk("Dom%d has maximum %u PIRQs\n", domid, n); + printk("%pd has maximum %u PIRQs\n", d, n); return n; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/ioport_emulate.c xen-4.17.5/xen/arch/x86/ioport_emulate.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/ioport_emulate.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/ioport_emulate.c 2024-08-14 09:03:57.000000000 +0000 @@ -8,11 +8,10 @@ #include #include -unsigned int (*__read_mostly ioemul_handle_quirk)( - uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs); +bool __ro_after_init ioemul_handle_quirk; -static unsigned int cf_check ioemul_handle_proliant_quirk( - u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs) +unsigned int ioemul_handle_proliant_quirk( + uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs) { static const char stub[] = { 0x9c, /* pushf */ @@ -103,7 +102,7 @@ static int __init cf_check ioport_quirks_init(void) { if ( dmi_check_system(ioport_quirks_tbl) ) - ioemul_handle_quirk = ioemul_handle_proliant_quirk; + ioemul_handle_quirk = true; return 0; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/irq.c xen-4.17.5/xen/arch/x86/irq.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/irq.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/irq.c 2024-08-14 09:03:57.000000000 +0000 @@ -53,8 +53,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq); -DEFINE_PER_CPU(struct cpu_user_regs *, __irq_regs); - static LIST_HEAD(irq_ratelimit_list); static DEFINE_SPINLOCK(irq_ratelimit_lock); static struct timer irq_ratelimit_timer; @@ -555,7 +553,58 @@ } if ( desc->arch.move_in_progress || desc->arch.move_cleanup_count ) - return -EAGAIN; + { + /* + * If the current destination is online refuse to shuffle. Retry after + * the in-progress movement has finished. + */ + if ( cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map) ) + return -EAGAIN; + + /* + * Due to the logic in fixup_irqs() that clears offlined CPUs from + * ->arch.old_cpu_mask it shouldn't be possible to get here with + * ->arch.move_{in_progress,cleanup_count} set and no online CPUs in + * ->arch.old_cpu_mask. + */ + ASSERT(valid_irq_vector(desc->arch.old_vector)); + ASSERT(cpumask_intersects(desc->arch.old_cpu_mask, &cpu_online_map)); + + if ( cpumask_intersects(desc->arch.old_cpu_mask, mask) ) + { + /* + * Fallback to the old destination if moving is in progress and the + * current destination is to be offlined. This is only possible if + * the CPUs in old_cpu_mask intersect with the affinity mask passed + * in the 'mask' parameter. + */ + desc->arch.vector = desc->arch.old_vector; + cpumask_and(desc->arch.cpu_mask, desc->arch.old_cpu_mask, mask); + + /* Undo any possibly done cleanup. */ + for_each_cpu(cpu, desc->arch.cpu_mask) + per_cpu(vector_irq, cpu)[desc->arch.vector] = irq; + + /* Cancel the pending move and release the current vector. 
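For reference while reading the irq.c hunks that follow, the per-IRQ movement state comes in two pairs (field names as used in this file):

    desc->arch.vector,     desc->arch.cpu_mask       current destination
    desc->arch.old_vector, desc->arch.old_cpu_mask   prior destination,
                                                     awaiting cleanup

The fallback above swaps the old pair back in when every CPU in cpu_mask is going offline while old_cpu_mask still intersects the requested mask.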
*/ + desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED; + cpumask_clear(desc->arch.old_cpu_mask); + desc->arch.move_in_progress = 0; + desc->arch.move_cleanup_count = 0; + if ( desc->arch.used_vectors ) + { + ASSERT(test_bit(old_vector, desc->arch.used_vectors)); + clear_bit(old_vector, desc->arch.used_vectors); + } + + return 0; + } + + /* + * There's an interrupt movement in progress but the destination(s) in + * ->arch.old_cpu_mask are not suitable given the 'mask' parameter, go + * through the full logic to find a new vector in a suitable CPU. + */ + } err = -ENOSPC; @@ -611,7 +660,22 @@ current_vector = vector; current_offset = offset; - if ( valid_irq_vector(old_vector) ) + if ( desc->arch.move_in_progress || desc->arch.move_cleanup_count ) + { + ASSERT(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map)); + /* + * Special case when evacuating an interrupt from a CPU to be + * offlined and the interrupt was already in the process of being + * moved. Leave ->arch.old_{vector,cpu_mask} as-is and just + * replace ->arch.{cpu_mask,vector} with the new destination. + * Cleanup will be done normally for the old fields, just release + * the current vector here. + */ + if ( desc->arch.used_vectors && + !test_and_clear_bit(old_vector, desc->arch.used_vectors) ) + ASSERT_UNREACHABLE(); + } + else if ( valid_irq_vector(old_vector) ) { cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask, &cpu_online_map); @@ -1349,6 +1413,7 @@ if ( radix_tree_delete(&d->pirq_tree, pirq->pirq) != pirq ) BUG(); + free_pirq_struct(pirq); } /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */ @@ -2222,6 +2287,7 @@ set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); + desc = NULL; info = NULL; irq = create_irq(NUMA_NO_NODE, true); @@ -2257,7 +2323,9 @@ if ( ret ) { - spin_unlock_irqrestore(&desc->lock, flags); + if ( desc ) + spin_unlock_irqrestore(&desc->lock, flags); + pci_disable_msi(msi_desc); if ( nr ) { @@ -2531,7 +2599,7 @@ } __initcall(setup_dump_irqs); -/* Reset irq affinities to match the given CPU mask. */ +/* Evacuate interrupts assigned to CPUs not present in the input CPU mask. */ void fixup_irqs(const cpumask_t *mask, bool verbose) { unsigned int irq; @@ -2540,8 +2608,8 @@ for ( irq = 0; irq < nr_irqs; irq++ ) { - bool break_affinity = false, set_affinity = true; - unsigned int vector; + bool break_affinity = false, set_affinity = true, check_irr = false; + unsigned int vector, cpu = smp_processor_id(); cpumask_t *affinity = this_cpu(scratch_cpumask); if ( irq == 2 ) @@ -2555,19 +2623,15 @@ vector = irq_to_vector(irq); if ( vector >= FIRST_HIPRIORITY_VECTOR && - vector <= LAST_HIPRIORITY_VECTOR ) + vector <= LAST_HIPRIORITY_VECTOR && + desc->handler == &no_irq_type ) { - cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask); - /* * This can in particular happen when parking secondary threads * during boot and when the serial console wants to use a PCI IRQ. */ - if ( desc->handler == &no_irq_type ) - { - spin_unlock(&desc->lock); - continue; - } + spin_unlock(&desc->lock); + continue; } if ( desc->arch.move_cleanup_count ) @@ -2578,39 +2642,71 @@ desc->arch.move_cleanup_count -= cpumask_weight(affinity); if ( !desc->arch.move_cleanup_count ) release_old_vec(desc); + else + /* + * Adjust old_cpu_mask to account for the offline CPUs, + * otherwise further calls to fixup_irqs() could subtract those + * again and possibly underflow the counter. 
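A worked instance of the underflow this guards against (CPU numbers illustrative, and assuming 'affinity' holds the CPUs that just went offline, per surrounding context not quoted here): start with old_cpu_mask = {2,3} and move_cleanup_count = 2. When CPU2 goes down:

    affinity = {2};  move_cleanup_count -= 1  ->  1

Without the cpumask_andnot() completing below, a later fixup_irqs() run for CPU3 would recompute affinity as {2,3} and subtract 2 from a counter holding 1. Dropping CPU2 from old_cpu_mask keeps the two in step.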
+ */ + cpumask_andnot(desc->arch.old_cpu_mask, desc->arch.old_cpu_mask, + affinity); } - if ( !desc->action || cpumask_subset(desc->affinity, mask) ) + if ( desc->arch.move_in_progress && + /* + * Only attempt to adjust the mask if the current CPU is going + * offline, otherwise the whole system is going down and leaving + * stale data in the masks is fine. + */ + !cpu_online(cpu) && + cpumask_test_cpu(cpu, desc->arch.old_cpu_mask) ) { - spin_unlock(&desc->lock); - continue; + /* + * This to be offlined CPU was the target of an interrupt that's + * been moved, and the new destination target hasn't yet + * acknowledged any interrupt from it. + * + * We know the interrupt is configured to target the new CPU at + * this point, so we can check IRR for any pending vectors and + * forward them to the new destination. + * + * Note that for the other case of an interrupt movement being in + * progress (move_cleanup_count being non-zero) we know the new + * destination has already acked at least one interrupt from this + * source, and hence there's no need to forward any stale + * interrupts. + */ + if ( apic_irr_read(desc->arch.old_vector) ) + send_IPI_mask(cpumask_of(cpumask_any(desc->arch.cpu_mask)), + desc->arch.vector); + + /* + * This CPU is going offline, remove it from ->arch.old_cpu_mask + * and possibly release the old vector if the old mask becomes + * empty. + * + * Note cleaning ->arch.old_cpu_mask is required if the CPU is + * brought offline and then online again, as when re-onlined the + * per-cpu vector table will no longer have ->arch.old_vector + * setup, and hence ->arch.old_cpu_mask would be stale. + */ + cpumask_clear_cpu(cpu, desc->arch.old_cpu_mask); + if ( cpumask_empty(desc->arch.old_cpu_mask) ) + { + desc->arch.move_in_progress = 0; + release_old_vec(desc); + } } /* - * In order for the affinity adjustment below to be successful, we - * need _assign_irq_vector() to succeed. This in particular means - * clearing desc->arch.move_in_progress if this would otherwise - * prevent the function from succeeding. Since there's no way for the - * flag to get cleared anymore when there's no possible destination - * left (the only possibility then would be the IRQs enabled window - * after this loop), there's then also no race with us doing it here. - * - * Therefore the logic here and there need to remain in sync. + * Avoid shuffling the interrupt around as long as current target CPUs + * are a subset of the input mask. What fixup_irqs() cares about is + * evacuating interrupts from CPUs not in the input mask. */ - if ( desc->arch.move_in_progress && - !cpumask_intersects(mask, desc->arch.cpu_mask) ) + if ( !desc->action || cpumask_subset(desc->arch.cpu_mask, mask) ) { - unsigned int cpu; - - cpumask_and(affinity, desc->arch.old_cpu_mask, &cpu_online_map); - - spin_lock(&vector_lock); - for_each_cpu(cpu, affinity) - per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq; - spin_unlock(&vector_lock); - - release_old_vec(desc); - desc->arch.move_in_progress = 0; + spin_unlock(&desc->lock); + continue; } if ( !cpumask_intersects(mask, desc->affinity) ) @@ -2624,6 +2720,14 @@ if ( desc->handler->disable ) desc->handler->disable(desc); + /* + * If the current CPU is going offline and is (one of) the target(s) of + * the interrupt, signal to check whether there are any pending vectors + * to be handled in the local APIC after the interrupt has been moved. 
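The IRR probe used in these hunks is cheap: apic_irr_read(), added to asm/apic.h earlier in this diff, is one 32-bit APIC register read plus a mask. Worked through for vector 0x50:

    register = APIC_IRR + (0x50 / 32) * 0x10 = APIC_IRR + 0x20
    bit      = 1U << (0x50 % 32)             = bit 16

which is why it can sit directly on the CPU-offline path, immediately ahead of the decision to forward any pending vector to the new destination via send_IPI_mask().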
+ */ + if ( !cpu_online(cpu) && cpumask_test_cpu(cpu, desc->arch.cpu_mask) ) + check_irr = true; + if ( desc->handler->set_affinity ) desc->handler->set_affinity(desc, affinity); else if ( !(warned++) ) @@ -2634,6 +2738,18 @@ cpumask_copy(affinity, desc->affinity); + if ( check_irr && apic_irr_read(vector) ) + /* + * Forward pending interrupt to the new destination, this CPU is + * going offline and otherwise the interrupt would be lost. + * + * Do the IRR check as late as possible before releasing the irq + * desc in order for any in-flight interrupts to be delivered to + * the lapic. + */ + send_IPI_mask(cpumask_of(cpumask_any(desc->arch.cpu_mask)), + desc->arch.vector); + spin_unlock(&desc->lock); if ( !verbose ) @@ -2645,11 +2761,6 @@ printk("Broke affinity for IRQ%u, new: %*pb\n", irq, CPUMASK_PR(affinity)); } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); } void fixup_eoi(void) @@ -2803,6 +2914,7 @@ d->domain_id, index, pirq, current_pirq); if ( current_pirq < 0 ) return -EBUSY; + pirq = current_pirq; } else if ( type == MAP_PIRQ_TYPE_MULTI_MSI ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/livepatch.c xen-4.17.5/xen/arch/x86/livepatch.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/livepatch.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/livepatch.c 2024-08-14 09:03:57.000000000 +0000 @@ -62,7 +62,7 @@ int noinline arch_livepatch_quiesce(void) { /* - * Relax perms on .text to be RWX, so we can modify them. + * Relax perms on .text/.rodata, so we can modify them. * * This relaxes perms globally, but all other CPUs are waiting on us. */ @@ -75,7 +75,7 @@ void noinline arch_livepatch_revive(void) { /* - * Reinstate perms on .text to be RX. This also cleans out the dirty + * Reinstate perms on .text/.rodata. This also cleans out the dirty * bits, which matters when CET Shstk is active. * * The other CPUs waiting for us could in principle have re-walked while diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/hap/hap.c xen-4.17.5/xen/arch/x86/mm/hap/hap.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/hap/hap.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/hap/hap.c 2024-08-14 09:03:57.000000000 +0000 @@ -739,8 +739,7 @@ return 1; } -static pagetable_t cf_check hap_update_cr3( - struct vcpu *v, bool do_locking, bool noflush) +static pagetable_t cf_check hap_update_cr3(struct vcpu *v, bool noflush) { v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; hvm_update_guest_cr3(v, noflush); @@ -826,7 +825,7 @@ } /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. 
*/ - hap_update_cr3(v, 0, false); + hap_update_cr3(v, false); unlock: paging_unlock(d); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/mm-locks.h xen-4.17.5/xen/arch/x86/mm/mm-locks.h --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/mm-locks.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/mm-locks.h 2024-08-14 09:03:57.000000000 +0000 @@ -86,8 +86,8 @@ this_cpu(mm_lock_level) = l; } -static inline void _mm_lock(const struct domain *d, mm_lock_t *l, - const char *func, int level, int rec) +static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, + const char *func, int level, int rec) { if ( !((mm_locked_by_me(l)) && rec) ) _check_lock_level(d, level); @@ -137,8 +137,8 @@ return (l->locker == get_processor_id()); } -static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, - const char *func, int level) +static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, + const char *func, int level) { if ( !mm_write_locked_by_me(l) ) { @@ -149,6 +149,8 @@ l->unlock_level = _get_lock_level(); _set_lock_level(_lock_level(d, level)); } + else + block_lock_speculation(); l->recurse_count++; } @@ -162,8 +164,8 @@ percpu_write_unlock(p2m_percpu_rwlock, &l->lock); } -static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, - int level) +static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, + int level) { _check_lock_level(d, level); percpu_read_lock(p2m_percpu_rwlock, &l->lock); @@ -178,15 +180,15 @@ /* This wrapper uses the line number to express the locking order below */ #define declare_mm_lock(name) \ - static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ - const char *func, int rec) \ + static always_inline void mm_lock_##name( \ + const struct domain *d, mm_lock_t *l, const char *func, int rec) \ { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } #define declare_mm_rwlock(name) \ - static inline void mm_write_lock_##name(const struct domain *d, \ - mm_rwlock_t *l, const char *func) \ + static always_inline void mm_write_lock_##name( \ + const struct domain *d, mm_rwlock_t *l, const char *func) \ { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ - static inline void mm_read_lock_##name(const struct domain *d, \ - mm_rwlock_t *l) \ + static always_inline void mm_read_lock_##name(const struct domain *d, \ + mm_rwlock_t *l) \ { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } /* These capture the name of the calling function */ #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) @@ -321,7 +323,7 @@ #define MM_LOCK_ORDER_altp2m 40 declare_mm_rwlock(altp2m); -static inline void p2m_lock(struct p2m_domain *p) +static always_inline void p2m_lock(struct p2m_domain *p) { if ( p2m_is_altp2m(p) ) mm_write_lock(altp2m, p->domain, &p->lock); @@ -345,6 +347,15 @@ #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) +static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m, + gfn_t gfn, unsigned int order) +{ + if ( condition ) + gfn_lock(p2m, gfn, order); + else + block_lock_speculation(); +} + /* PoD lock (per-p2m-table) * * Protects private PoD data structs: entry and cache diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m-ept.c xen-4.17.5/xen/arch/x86/mm/p2m-ept.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m-ept.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/p2m-ept.c 2024-08-14 09:03:57.000000000 +0000 @@ -512,12 +512,6 @@ return -1; } - if ( 
!mfn_valid(mfn) ) - { - *ipat = true; - return MTRR_TYPE_UNCACHABLE; - } - /* * Conditional must be kept in sync with the code in * {iomem,ioports}_{permit,deny}_access(). @@ -530,8 +524,12 @@ } for ( special_pgs = i = 0; i < (1ul << order); i++ ) - if ( is_special_page(mfn_to_page(mfn_add(mfn, i))) ) + { + mfn_t cur = mfn_add(mfn, i); + + if ( mfn_valid(cur) && is_special_page(mfn_to_page(cur)) ) special_pgs++; + } if ( special_pgs ) { @@ -658,6 +656,8 @@ if ( e.emt != MTRR_NUM_TYPES ) break; + ASSERT(is_epte_present(&e)); + if ( level == 0 ) { for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i ) @@ -923,17 +923,6 @@ if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) { - bool ipat; - int emt = epte_get_entry_emt(p2m->domain, _gfn(gfn), mfn, - i * EPT_TABLE_ORDER, &ipat, - p2mt); - - if ( emt >= 0 ) - new_entry.emt = emt; - else /* ept_handle_misconfig() will need to take care of this. */ - new_entry.emt = MTRR_NUM_TYPES; - - new_entry.ipat = ipat; new_entry.sp = !!i; new_entry.sa_p2mt = p2mt; new_entry.access = p2ma; @@ -949,6 +938,22 @@ need_modify_vtd_table = 0; ept_p2m_type_to_flags(p2m, &new_entry); + + if ( is_epte_present(&new_entry) ) + { + bool ipat; + int emt = epte_get_entry_emt(p2m->domain, _gfn(gfn), mfn, + i * EPT_TABLE_ORDER, &ipat, + p2mt); + + BUG_ON(mfn_eq(mfn, INVALID_MFN)); + + if ( emt >= 0 ) + new_entry.emt = emt; + else /* ept_handle_misconfig() will need to take care of this. */ + new_entry.emt = MTRR_NUM_TYPES; + new_entry.ipat = ipat; + } } if ( sve != -1 ) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m-pod.c xen-4.17.5/xen/arch/x86/mm/p2m-pod.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m-pod.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/p2m-pod.c 2024-08-14 09:03:57.000000000 +0000 @@ -36,7 +36,7 @@ #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) /* Enforce lock ordering when grabbing the "external" page_alloc lock */ -static inline void lock_page_alloc(struct p2m_domain *p2m) +static always_inline void lock_page_alloc(struct p2m_domain *p2m) { page_alloc_mm_pre_lock(p2m->domain); spin_lock(&(p2m->domain->page_alloc_lock)); @@ -1370,19 +1370,28 @@ } } + /* + * P2M update and stats increment need to collectively be under PoD lock, + * to prevent code elsewhere observing PoD entry count being zero despite + * there actually still being PoD entries (created by the p2m_set_entry() + * invocation below). 
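[Editor's note] The p2m-pod.c change above widens the PoD lock so the p2m update and the entry-count adjustment publish together; the race being closed is the classic check-then-act pattern. A minimal pthreads sketch of the corrected shape, names hypothetical:

#include <pthread.h>

static pthread_mutex_t pod_lock = PTHREAD_MUTEX_INITIALIZER;
static long entry_count;

/* Hypothetical stand-in for p2m_set_entry() creating PoD entries. */
static int set_entries(unsigned long n) { (void)n; return 0; }

/* Entries and counter are published atomically: no window in which
 * another thread can see entry_count == 0 while entries already exist. */
static int add_pod_entries(unsigned long n)
{
    int rc;

    pthread_mutex_lock(&pod_lock);
    rc = set_entries(n);
    if ( rc == 0 )
        entry_count += n;
    pthread_mutex_unlock(&pod_lock);

    return rc;
}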
+ */ + pod_lock(p2m); + /* Now, actually do the two-way mapping */ rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_populate_on_demand, p2m->default_access); if ( rc == 0 ) { - pod_lock(p2m); p2m->pod.entry_count += 1UL << order; p2m->pod.entry_count -= pod_count; BUG_ON(p2m->pod.entry_count < 0); - pod_unlock(p2m); + } + + pod_unlock(p2m); + if ( rc == 0 ) ioreq_request_mapcache_invalidate(d); - } else if ( order ) { /* diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m.c xen-4.17.5/xen/arch/x86/mm/p2m.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/p2m.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/p2m.c 2024-08-14 09:03:57.000000000 +0000 @@ -292,9 +292,8 @@ if ( q & P2M_UNSHARE ) q |= P2M_ALLOC; - if ( locked ) - /* Grab the lock here, don't release until put_gfn */ - gfn_lock(p2m, gfn, 0); + /* Grab the lock here, don't release until put_gfn */ + gfn_lock_if(locked, p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL); @@ -1266,9 +1265,11 @@ else { gfn_unlock(p2m, gfn, 0); - printk(XENLOG_G_WARNING - "non-identity map d%d:%lx not cleared (mapped to %lx)\n", - d->domain_id, gfn_l, mfn_x(mfn)); + if ( (p2mt != p2m_invalid && p2mt != p2m_mmio_dm) || + a != p2m_access_n || !mfn_eq(mfn, INVALID_MFN) ) + printk(XENLOG_G_WARNING + "non-identity map %pd:%lx not cleared (mapped to %lx)\n", + d, gfn_l, mfn_x(mfn)); ret = 0; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/paging.c xen-4.17.5/xen/arch/x86/mm/paging.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/paging.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/paging.c 2024-08-14 09:03:57.000000000 +0000 @@ -779,7 +779,7 @@ if ( d == NULL ) return -ESRCH; - ret = xsm_domctl(XSM_OTHER, d, op.cmd); + ret = xsm_domctl(XSM_OTHER, d, op.cmd, 0 /* SSIDref not applicable */); if ( !ret ) { if ( domctl_lock_acquire() ) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/common.c xen-4.17.5/xen/arch/x86/mm/shadow/common.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/common.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/shadow/common.c 2024-08-14 09:03:57.000000000 +0000 @@ -2579,7 +2579,7 @@ } #endif /* OOS */ - v->arch.paging.mode->update_cr3(v, 0, false); + v->arch.paging.mode->update_cr3(v, false); } void cf_check shadow_update_paging_modes(struct vcpu *v) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/multi.c xen-4.17.5/xen/arch/x86/mm/shadow/multi.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/multi.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/shadow/multi.c 2024-08-14 09:03:57.000000000 +0000 @@ -2506,7 +2506,7 @@ * In any case, in the PAE case, the ASSERT is not true; it can * happen because of actions the guest is taking. */ #if GUEST_PAGING_LEVELS == 3 - v->arch.paging.mode->update_cr3(v, 0, false); + v->arch.paging.mode->update_cr3(v, false); #else ASSERT(d->is_shutting_down); #endif @@ -3224,17 +3224,13 @@ } } -static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, - bool noflush) +static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool noflush) /* Updates vcpu->arch.cr3 after the guest has changed CR3. * Paravirtual guests should set v->arch.guest_table (and guest_table_user, * if appropriate). * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; * this function will call hvm_update_guest_cr(v, 3) to tell them where the * shadow tables are. 
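[Editor's note] The sh_update_cr3() rework just below drops the do_locking flag argument in favour of a recursive paging lock, making the function safe whether or not the caller already holds the lock. A rough POSIX analogue of that pattern, names hypothetical:

#include <pthread.h>

static pthread_mutex_t paging_lock;

/* Analogue of paging_lock_recursive(): one lock usable by both external
 * callers and shadow-internal callers that already hold it. */
static void paging_lock_init(void)
{
    pthread_mutexattr_t attr;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&paging_lock, &attr);
}

static void update_cr3_model(void)
{
    pthread_mutex_lock(&paging_lock);   /* fine even if already held */
    /* ... update shadow state ... */
    pthread_mutex_unlock(&paging_lock);
}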
- * If do_locking != 0, assume we are being called from outside the - * shadow code, and must take and release the paging lock; otherwise - * that is the caller's responsibility. */ { struct domain *d = v->domain; @@ -3252,7 +3248,11 @@ return old_entry; } - if ( do_locking ) paging_lock(v->domain); + /* + * This is used externally (with the paging lock not taken) and internally + * by the shadow code (with the lock already taken). + */ + paging_lock_recursive(v->domain); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all the shadow entries on a TLB flush. Resync @@ -3480,8 +3480,7 @@ shadow_sync_other_vcpus(v); #endif - /* Release the lock, if we took it (otherwise it's the caller's problem) */ - if ( do_locking ) paging_unlock(v->domain); + paging_unlock(v->domain); return old_entry; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/none.c xen-4.17.5/xen/arch/x86/mm/shadow/none.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm/shadow/none.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm/shadow/none.c 2024-08-14 09:03:57.000000000 +0000 @@ -52,8 +52,7 @@ } #endif -static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, - bool noflush) +static pagetable_t cf_check _update_cr3(struct vcpu *v, bool noflush) { ASSERT_UNREACHABLE(); return pagetable_null(); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/mm.c xen-4.17.5/xen/arch/x86/mm.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/mm.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/mm.c 2024-08-14 09:03:57.000000000 +0000 @@ -2033,7 +2033,7 @@ #define current_locked_page_ne_check(x) true #endif -int page_lock(struct page_info *page) +int page_lock_unsafe(struct page_info *page) { unsigned long x, nx; @@ -2094,7 +2094,7 @@ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in * reverse order. 
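[Editor's note] Alongside the always_inline conversions, the locking changes in this area (gfn_lock_if() above, l3t_lock() and the spin_lock_if() conversions below) keep speculation constrained on both branches: taking the lock serialises by itself, while the skipped-lock path gets an explicit barrier. A minimal sketch, assuming x86 and modelling Xen's block_lock_speculation() as LFENCE:

#include <pthread.h>
#include <stdbool.h>

/* Stand-in for Xen's block_lock_speculation(): a speculation barrier. */
static inline void block_lock_speculation(void)
{
    __asm__ volatile ( "lfence" ::: "memory" );
}

/* Shape of the spin_lock_if()/gfn_lock_if() pattern: even when the lock
 * is skipped, speculation past the decision point is still fenced off,
 * so a mispredicted "no lock needed" path cannot run ahead unlocked. */
static inline void lock_if(bool condition, pthread_mutex_t *m)
{
    if ( condition )
        pthread_mutex_lock(m);    /* acquisition itself serialises */
    else
        block_lock_speculation();
}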
*/ -static void l3t_lock(struct page_info *page) +static always_inline void l3t_lock(struct page_info *page) { unsigned long x, nx; @@ -2103,6 +2103,8 @@ cpu_relax(); nx = x | PGT_locked; } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); + + block_lock_speculation(); } static void l3t_unlock(struct page_info *page) @@ -5021,8 +5023,7 @@ if ( !l3t ) return NULL; UNMAP_DOMAIN_PAGE(l3t); - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) { l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); @@ -5059,8 +5060,7 @@ return NULL; } UNMAP_DOMAIN_PAGE(l2t); - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); @@ -5098,8 +5098,7 @@ return NULL; } UNMAP_DOMAIN_PAGE(l1t); - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); @@ -5130,6 +5129,8 @@ do { \ if ( locking ) \ l3t_lock(page); \ + else \ + block_lock_speculation(); \ } while ( false ) #define L3T_UNLOCK(page) \ @@ -5345,8 +5346,7 @@ if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { @@ -5450,8 +5450,7 @@ if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { @@ -5492,8 +5491,7 @@ unsigned long base_mfn; const l1_pgentry_t *l1t; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); ol2e = *pl2e; /* @@ -5547,8 +5545,7 @@ unsigned long base_mfn; const l2_pgentry_t *l2t; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); ol3e = *pl3e; /* @@ -5692,8 +5689,7 @@ l3e_get_flags(*pl3e))); UNMAP_DOMAIN_PAGE(l2t); - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { @@ -5752,8 +5748,7 @@ l2e_get_flags(*pl2e) & ~_PAGE_PSE)); UNMAP_DOMAIN_PAGE(l1t); - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { @@ -5797,8 +5792,7 @@ */ if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) continue; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); /* * L2E may be already cleared, or set to a superpage, by @@ -5845,8 +5839,7 @@ if ( (nf & _PAGE_PRESENT) || ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) continue; - if ( locking ) - spin_lock(&map_pgdir_lock); + spin_lock_if(locking, &map_pgdir_lock); /* * L3E may be already cleared, or set to a superpage, by @@ -5963,7 +5956,7 @@ v += 1UL << L1_PAGETABLE_SHIFT; - if ( l2_table_offset(v) == 0 ) + if ( l1_table_offset(v) == 0 ) break; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/msi.c xen-4.17.5/xen/arch/x86/msi.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/msi.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/msi.c 2024-08-14 09:03:57.000000000 +0000 @@ 
-17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1428,6 +1429,9 @@ unsigned long flags; const char *type = "???"; + if ( !(irq & 0x1f) ) + process_pending_softirqs(); + if ( !irq_desc_initialized(desc) ) continue; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/msr.c xen-4.17.5/xen/arch/x86/msr.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/msr.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/msr.c 2024-08-14 09:03:57.000000000 +0000 @@ -316,8 +316,8 @@ /* * Caller to confirm that MSR_SPEC_CTRL is available. Intel and AMD have - * separate CPUID features for this functionality, but only set will be - * active. + * separate CPUID features for some of this functionality, but only one + * vendors-worth will be active on a single host. */ uint64_t msr_spec_ctrl_valid_bits(const struct cpu_policy *cp) { @@ -331,6 +331,11 @@ return (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | (ssbd ? SPEC_CTRL_SSBD : 0) | (psfd ? SPEC_CTRL_PSFD : 0) | + (cp->feat.ipred_ctrl + ? (SPEC_CTRL_IPRED_DIS_U | SPEC_CTRL_IPRED_DIS_S) : 0) | + (cp->feat.rrsba_ctrl + ? (SPEC_CTRL_RRSBA_DIS_U | SPEC_CTRL_RRSBA_DIS_S) : 0) | + (cp->feat.bhi_ctrl ? SPEC_CTRL_BHI_DIS_S : 0) | 0); } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/nmi.c xen-4.17.5/xen/arch/x86/nmi.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/nmi.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/nmi.c 2024-08-14 09:03:57.000000000 +0000 @@ -150,6 +150,8 @@ static void __init cf_check wait_for_nmis(void *p) { + cpumask_t *stuck_cpus = p; + unsigned int cpu = smp_processor_id(); unsigned int start_count = this_cpu(nmi_count); unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz; unsigned long s, e; @@ -158,42 +160,35 @@ do { cpu_relax(); if ( this_cpu(nmi_count) >= start_count + 2 ) - break; + return; + e = rdtsc(); - } while( e - s < ticks ); + } while ( e - s < ticks ); + + /* Timeout. Mark ourselves as stuck. */ + cpumask_set_cpu(cpu, stuck_cpus); } void __init check_nmi_watchdog(void) { - static unsigned int __initdata prev_nmi_count[NR_CPUS]; - int cpu; - bool ok = true; + static cpumask_t __initdata stuck_cpus; if ( nmi_watchdog == NMI_NONE ) return; printk("Testing NMI watchdog on all CPUs:"); - for_each_online_cpu ( cpu ) - prev_nmi_count[cpu] = per_cpu(nmi_count, cpu); - /* * Wait at most 10 ticks for 2 watchdog NMIs on each CPU. * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog * uses only runs while the core's not halted */ - on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1); - - for_each_online_cpu ( cpu ) - { - if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 ) - { - printk(" %d", cpu); - ok = false; - } - } + on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1); - printk(" %s\n", ok ? 
"ok" : "stuck"); + if ( cpumask_empty(&stuck_cpus) ) + printk("ok\n"); + else + printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus)); /* * Now that we know it works we can reduce NMI frequency to @@ -323,8 +318,6 @@ { unsigned int evntsel; - nmi_perfctr_msr = MSR_P6_PERFCTR(0); - if ( !nmi_p6_event_width && current_cpu_data.cpuid_level >= 0xa ) nmi_p6_event_width = MASK_EXTR(cpuid_eax(0xa), P6_EVENT_WIDTH_MASK); if ( !nmi_p6_event_width ) @@ -334,6 +327,8 @@ nmi_p6_event_width > BITS_PER_LONG ) return; + nmi_perfctr_msr = MSR_P6_PERFCTR(0); + clear_msr_range(MSR_P6_EVNTSEL(0), 2); clear_msr_range(MSR_P6_PERFCTR(0), 2); @@ -349,13 +344,13 @@ wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0); } -static int setup_p4_watchdog(void) +static void setup_p4_watchdog(void) { uint64_t misc_enable; rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) - return 0; + return; nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0; nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; @@ -378,13 +373,12 @@ clear_msr_range(0x3E0, 2); clear_msr_range(MSR_P4_BPU_CCCR0, 18); clear_msr_range(MSR_P4_BPU_PERFCTR0, 18); - + wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0); wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE); write_watchdog_counter("P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val); - return 1; } void setup_apic_nmi_watchdog(void) @@ -392,17 +386,12 @@ if ( nmi_watchdog == NMI_NONE ) return; - switch (boot_cpu_data.x86_vendor) { + switch ( boot_cpu_data.x86_vendor ) + { case X86_VENDOR_AMD: - switch (boot_cpu_data.x86) { - case 6: - case 0xf ... 0x19: - setup_k7_watchdog(); - break; - default: - return; - } + setup_k7_watchdog(); break; + case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { case 6: @@ -411,14 +400,16 @@ : CORE_EVENT_CPU_CLOCKS_NOT_HALTED); break; case 15: - if (!setup_p4_watchdog()) - return; + setup_p4_watchdog(); break; - default: - return; } break; - default: + } + + if ( nmi_perfctr_msr == 0 ) + { + printk(XENLOG_WARNING "Failed to configure NMI watchdog\n"); + nmi_watchdog = NMI_NONE; return; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/platform_hypercall.c xen-4.17.5/xen/arch/x86/platform_hypercall.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/platform_hypercall.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/platform_hypercall.c 2024-08-14 09:03:57.000000000 +0000 @@ -299,7 +299,7 @@ ret = -EINVAL; if ( op->u.read_memtype.reg < num_var_ranges ) { - mtrr_if->get(op->u.read_memtype.reg, &mfn, &nr_mfns, &type); + mtrr_get(op->u.read_memtype.reg, &mfn, &nr_mfns, &type); op->u.read_memtype.mfn = mfn; op->u.read_memtype.nr_mfns = nr_mfns; op->u.read_memtype.type = type; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/pv/dom0_build.c xen-4.17.5/xen/arch/x86/pv/dom0_build.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/pv/dom0_build.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/pv/dom0_build.c 2024-08-14 09:03:57.000000000 +0000 @@ -821,6 +821,8 @@ rc = elf_load_binary(&elf); if ( rc < 0 ) { + mapcache_override_current(NULL); + switch_cr3_cr4(current->arch.cr3, read_cr4()); printk("Failed to load the kernel binary\n"); goto out; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/pv/emul-priv-op.c xen-4.17.5/xen/arch/x86/pv/emul-priv-op.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/pv/emul-priv-op.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/pv/emul-priv-op.c 2024-08-14 09:03:57.000000000 +0000 @@ -124,7 +124,7 @@ /* Some platforms might need to quirk the stub for specific inputs. 
*/ if ( unlikely(ioemul_handle_quirk) ) { - quirk_bytes = ioemul_handle_quirk(opcode, p, ctxt->ctxt.regs); + quirk_bytes = ioemul_handle_proliant_quirk(opcode, p, ctxt->ctxt.regs); p += quirk_bytes; } @@ -358,6 +358,8 @@ case DR_LEN_8: width = 8; break; } + start &= ~(width - 1UL); + if ( (start < (port + len)) && ((start + width) > port) ) match |= 1u << i; } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/setup.c xen-4.17.5/xen/arch/x86/setup.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/setup.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/setup.c 2024-08-14 09:03:57.000000000 +0000 @@ -622,6 +622,10 @@ system_state = SYS_STATE_active; + /* Re-run stub recovery self-tests with CET-SS active. */ + if ( IS_ENABLED(CONFIG_DEBUG) && cpu_has_xen_shstk ) + stub_selftest(); + domain_unpause_by_systemcontroller(dom0); /* MUST be done prior to removing .init data. */ @@ -1984,7 +1988,7 @@ if ( bsp_delay_spec_ctrl ) { - info->spec_ctrl_flags &= ~SCF_use_shadow; + info->scf &= ~SCF_use_shadow; barrier(); wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); info->last_spec_ctrl = default_xen_spec_ctrl; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/smp.c xen-4.17.5/xen/arch/x86/smp.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/smp.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/smp.c 2024-08-14 09:03:57.000000000 +0000 @@ -89,7 +89,7 @@ * the system have been accounted for. */ if ( system_state > SYS_STATE_smp_boot && - !unaccounted_cpus && !disabled_cpus && + !unaccounted_cpus && !disabled_cpus && !cpu_in_hotplug_context() && /* NB: get_cpu_maps lock requires enabled interrupts. */ local_irq_is_enabled() && (cpus_locked = get_cpu_maps()) && (park_offline_cpus || diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/spec_ctrl.c xen-4.17.5/xen/arch/x86/spec_ctrl.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/spec_ctrl.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/spec_ctrl.c 2024-08-14 09:03:57.000000000 +0000 @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -37,13 +38,24 @@ static bool __initdata opt_msr_sc_hvm = true; static int8_t __initdata opt_rsb_pv = -1; static bool __initdata opt_rsb_hvm = true; -static int8_t __ro_after_init opt_md_clear_pv = -1; -static int8_t __ro_after_init opt_md_clear_hvm = -1; +static int8_t __ro_after_init opt_verw_pv = -1; +static int8_t __ro_after_init opt_verw_hvm = -1; static int8_t __ro_after_init opt_ibpb_entry_pv = -1; static int8_t __ro_after_init opt_ibpb_entry_hvm = -1; static bool __ro_after_init opt_ibpb_entry_dom0; +static int8_t __ro_after_init opt_bhb_entry_pv = -1; +static int8_t __ro_after_init opt_bhb_entry_hvm = -1; +static bool __ro_after_init opt_bhb_entry_dom0; +static enum bhb_thunk { + BHB_DEFAULT, + BHB_NONE, + BHB_TSX, + BHB_SHORT, + BHB_LONG, +} opt_bhb_seq __initdata; + /* Cmdline controls for Xen's speculative settings. */ static enum ind_thunk { THUNK_DEFAULT, /* Decide which thunk to use at boot time. 
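[Editor's note] The emul-priv-op.c change above aligns the %dr breakpoint address down to its width before the overlap test, matching how the debug hardware masks the low address bits. A small self-checking model of the fixed check:

#include <assert.h>

/* Model of the fixed %dr7 I/O breakpoint check: hardware aligns the
 * breakpoint to its width, so the match must use the aligned base. */
static int bp_matches(unsigned long addr, unsigned int width,
                      unsigned int port, unsigned int len)
{
    unsigned long start = addr & ~(unsigned long)(width - 1);

    return start < port + len && start + width > port;
}

int main(void)
{
    /* A 4-byte breakpoint at 0x1ff covers 0x1fc-0x1ff once aligned. */
    assert(bp_matches(0x1ff, 4, 0x1fc, 1));
    assert(!bp_matches(0x1ff, 4, 0x200, 1));
    return 0;
}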
*/ @@ -58,18 +70,21 @@ int8_t __initdata opt_stibp = -1; bool __ro_after_init opt_ssbd; int8_t __initdata opt_psfd = -1; +int8_t __ro_after_init opt_bhi_dis_s = -1; int8_t __ro_after_init opt_ibpb_ctxt_switch = -1; -int8_t __read_mostly opt_eager_fpu = -1; -int8_t __read_mostly opt_l1d_flush = -1; -static bool __initdata opt_branch_harden = true; +int8_t __ro_after_init opt_eager_fpu = -1; +int8_t __ro_after_init opt_l1d_flush = -1; +static bool __initdata opt_branch_harden = + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); +static bool __initdata opt_lock_harden; bool __initdata bsp_delay_spec_ctrl; -uint8_t __read_mostly default_xen_spec_ctrl; -uint8_t __read_mostly default_spec_ctrl_flags; +unsigned int __ro_after_init default_xen_spec_ctrl; +uint8_t __ro_after_init default_scf; -paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr; -bool __read_mostly cpu_has_bug_l1tf; +paddr_t __ro_after_init l1tf_addr_mask, __ro_after_init l1tf_safe_maddr; +bool __ro_after_init cpu_has_bug_l1tf; static unsigned int __initdata l1d_maxphysaddr; static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */ @@ -77,7 +92,7 @@ static int8_t __initdata opt_srb_lock = -1; static bool __initdata opt_unpriv_mmio; -static bool __ro_after_init opt_fb_clear_mmio; +static bool __ro_after_init opt_verw_mmio; static int8_t __initdata opt_gds_mit = -1; static int8_t __initdata opt_div_scrub = -1; @@ -119,18 +134,23 @@ disable_common: opt_rsb_pv = false; opt_rsb_hvm = false; - opt_md_clear_pv = 0; - opt_md_clear_hvm = 0; + opt_verw_pv = 0; + opt_verw_hvm = 0; opt_ibpb_entry_pv = 0; opt_ibpb_entry_hvm = 0; opt_ibpb_entry_dom0 = false; + opt_bhb_entry_pv = 0; + opt_bhb_entry_hvm = 0; + opt_bhb_entry_dom0 = false; opt_thunk = THUNK_JMP; + opt_bhb_seq = BHB_NONE; opt_ibrs = 0; opt_ibpb_ctxt_switch = false; opt_ssbd = false; opt_l1d_flush = 0; opt_branch_harden = false; + opt_lock_harden = false; opt_srb_lock = 0; opt_unpriv_mmio = false; opt_gds_mit = 0; @@ -151,15 +171,17 @@ { opt_msr_sc_pv = val; opt_rsb_pv = val; - opt_md_clear_pv = val; + opt_verw_pv = val; opt_ibpb_entry_pv = val; + opt_bhb_entry_pv = val; } else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) { opt_msr_sc_hvm = val; opt_rsb_hvm = val; - opt_md_clear_hvm = val; + opt_verw_hvm = val; opt_ibpb_entry_hvm = val; + opt_bhb_entry_hvm = val; } else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) { @@ -203,21 +225,22 @@ break; } } - else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) + else if ( (val = parse_boolean("verw", s, ss)) != -1 || + (val = parse_boolean("md-clear", s, ss)) != -1 ) { switch ( val ) { case 0: case 1: - opt_md_clear_pv = opt_md_clear_hvm = val; + opt_verw_pv = opt_verw_hvm = val; break; case -2: - s += strlen("md-clear="); + s += (*s == 'v') ? 
strlen("verw=") : strlen("md-clear="); if ( (val = parse_boolean("pv", s, ss)) >= 0 ) - opt_md_clear_pv = val; + opt_verw_pv = val; else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) - opt_md_clear_hvm = val; + opt_verw_hvm = val; else default: rc = -EINVAL; @@ -246,13 +269,40 @@ break; } } + else if ( (val = parse_boolean("bhb-entry", s, ss)) != -1 ) + { + switch ( val ) + { + case 0: + case 1: + opt_bhb_entry_pv = opt_bhb_entry_hvm = + opt_bhb_entry_dom0 = val; + break; + + case -2: + s += strlen("bhb-entry="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) + opt_bhb_entry_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + opt_bhb_entry_hvm = val; + else + default: + rc = -EINVAL; + break; + } + } /* Xen's speculative sidechannel mitigation settings. */ else if ( !strncmp(s, "bti-thunk=", 10) ) { s += 10; - if ( !cmdline_strcmp(s, "retpoline") ) + if ( !IS_ENABLED(CONFIG_INDIRECT_THUNK) ) + { + no_config_param("INDIRECT_THUNK", "spec-ctrl", s - 10, ss); + rc = -EINVAL; + } + else if ( !cmdline_strcmp(s, "retpoline") ) opt_thunk = THUNK_RETPOLINE; else if ( !cmdline_strcmp(s, "lfence") ) opt_thunk = THUNK_LFENCE; @@ -261,6 +311,21 @@ else rc = -EINVAL; } + else if ( !strncmp(s, "bhb-seq=", 8) ) + { + s += strlen("bhb-seq="); + + if ( !cmdline_strcmp(s, "none") ) + opt_bhb_seq = BHB_NONE; + else if ( !cmdline_strcmp(s, "tsx") ) + opt_bhb_seq = BHB_TSX; + else if ( !cmdline_strcmp(s, "short") ) + opt_bhb_seq = BHB_SHORT; + else if ( !cmdline_strcmp(s, "long") ) + opt_bhb_seq = BHB_LONG; + else + rc = -EINVAL; + } /* Bits in MSR_SPEC_CTRL. */ else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 ) @@ -271,6 +336,8 @@ opt_ssbd = val; else if ( (val = parse_boolean("psfd", s, ss)) >= 0 ) opt_psfd = val; + else if ( (val = parse_boolean("bhi-dis-s", s, ss)) >= 0 ) + opt_bhi_dis_s = val; /* Misc settings. */ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) @@ -280,7 +347,26 @@ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) opt_l1d_flush = val; else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) - opt_branch_harden = val; + { + if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ) + opt_branch_harden = val; + else + { + no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s, + ss); + rc = -EINVAL; + } + } + else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) + { + if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) + opt_lock_harden = val; + else + { + no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); + rc = -EINVAL; + } + } else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) opt_srb_lock = val; else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) @@ -299,8 +385,8 @@ } custom_param("spec-ctrl", parse_spec_ctrl); -int8_t __read_mostly opt_xpti_hwdom = -1; -int8_t __read_mostly opt_xpti_domu = -1; +int8_t __ro_after_init opt_xpti_hwdom = -1; +int8_t __ro_after_init opt_xpti_domu = -1; static __init void xpti_init_default(void) { @@ -364,8 +450,8 @@ } custom_param("xpti", parse_xpti); -int8_t __read_mostly opt_pv_l1tf_hwdom = -1; -int8_t __read_mostly opt_pv_l1tf_domu = -1; +int8_t __ro_after_init opt_pv_l1tf_hwdom = -1; +int8_t __ro_after_init opt_pv_l1tf_domu = -1; static int __init cf_check parse_pv_l1tf(const char *s) { @@ -431,7 +517,7 @@ * Hardware read-only information, stating immunity to certain issues, or * suggestions of which mitigation to use. 
*/ - printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", @@ -447,6 +533,7 @@ (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", + (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", @@ -457,7 +544,7 @@ (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); /* Hardware features which need driving to mitigate issues. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || @@ -475,10 +562,15 @@ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", + (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); /* Compiled-in support which pertains to mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) printk(" Compiled-in support:" #ifdef CONFIG_INDIRECT_THUNK " INDIRECT_THUNK" @@ -486,14 +578,32 @@ #ifdef CONFIG_SHADOW_PAGING " SHADOW_PAGING" #endif +#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY + " HARDEN_ARRAY" +#endif +#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH + " HARDEN_BRANCH" +#endif +#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS + " HARDEN_GUEST_ACCESS" +#endif +#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK + " HARDEN_LOCK" +#endif "\n"); /* Settings for Xen's protection, irrespective of guests. */ - printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : - thunk == THUNK_JMP ? "JMP" : "?", + printk(" Xen settings: %s%s%s%sSPEC_CTRL: %s%s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", + thunk != THUNK_NONE ? "BTI-Thunk: " : "", + thunk == THUNK_NONE ? "" : + thunk == THUNK_RETPOLINE ? "RETPOLINE, " : + thunk == THUNK_LFENCE ? "LFENCE, " : + thunk == THUNK_JMP ? "JMP, " : "?, ", + opt_bhb_seq != BHB_NONE ? "BHB-Seq: " : "", + opt_bhb_seq == BHB_NONE ? "" : + opt_bhb_seq == BHB_TSX ? "TSX, " : + opt_bhb_seq == BHB_SHORT ? "SHORT, " : + opt_bhb_seq == BHB_LONG ? "LONG, " : "?, ", (!boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBRS)) ? "No" : (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", @@ -506,16 +616,19 @@ (!boot_cpu_has(X86_FEATURE_PSFD) && !boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ? "" : (default_xen_spec_ctrl & SPEC_CTRL_PSFD) ? " PSFD+" : " PSFD-", + !boot_cpu_has(X86_FEATURE_BHI_CTRL) ? "" : + (default_xen_spec_ctrl & SPEC_CTRL_BHI_DIS_S) ? " BHI_DIS_S+" : " BHI_DIS_S-", !(caps & ARCH_CAPS_TSX_CTRL) ? "" : (opt_tsx & 1) ? " TSX+" : " TSX-", !cpu_has_srbds_ctrl ? 
"" : opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm || - opt_fb_clear_mmio ? " VERW" : "", + opt_verw_pv || opt_verw_hvm || + opt_verw_mmio ? " VERW" : "", opt_div_scrub ? " DIV" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); + opt_branch_harden ? " BRANCH_HARDEN" : "", + opt_lock_harden ? " LOCK_HARDEN" : ""); /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) @@ -529,32 +642,35 @@ * mitigation support for guests. */ #ifdef CONFIG_HVM - printk(" Support for HVM VMs:%s%s%s%s%s%s%s\n", + printk(" Support for HVM VMs:%s%s%s%s%s%s%s%s\n", (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || - amd_virt_spec_ctrl || - opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", + opt_bhb_entry_hvm || amd_virt_spec_ctrl || + opt_eager_fpu || opt_verw_hvm) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || amd_virt_spec_ctrl) ? " MSR_VIRT_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", opt_eager_fpu ? " EAGER_FPU" : "", - opt_md_clear_hvm ? " MD_CLEAR" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); + opt_verw_hvm ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : "", + opt_bhb_entry_hvm ? " BHB-entry" : ""); #endif #ifdef CONFIG_PV - printk(" Support for PV VMs:%s%s%s%s%s%s\n", + printk(" Support for PV VMs:%s%s%s%s%s%s%s\n", (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || boot_cpu_has(X86_FEATURE_SC_RSB_PV) || boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || - opt_eager_fpu || opt_md_clear_pv) ? "" : " None", + opt_bhb_entry_pv || + opt_eager_fpu || opt_verw_pv) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", opt_eager_fpu ? " EAGER_FPU" : "", - opt_md_clear_pv ? " MD_CLEAR" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); + opt_verw_pv ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : "", + opt_bhb_entry_pv ? " BHB-entry" : ""); printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", opt_xpti_hwdom ? "enabled" : "disabled", @@ -1068,7 +1184,7 @@ * NMI/#MC, so can't interrupt Xen ahead of having already flushed the * BTB. */ - default_spec_ctrl_flags |= SCF_ist_ibpb; + default_scf |= SCF_ist_ibpb; } if ( opt_ibpb_entry_hvm ) setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_HVM); @@ -1330,6 +1446,83 @@ } } +/* + * Register File Data Sampling affects Atom cores from the Goldmont to + * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to + * some but not all unaffected parts, and RFDS_CLEAR to affected parts still + * in support. + * + * Alder Lake and Raptor Lake client CPUs have a mix of P cores + * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, + * vulnerable), and both enumerate RFDS_CLEAR. + * + * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by + * platform configuration, and enumerate RFDS_NO. + * + * With older parts, or with out-of-date microcode, synthesise RFDS_NO when + * safe to do so. 
+ *
+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html
+ */
+static void __init rfds_calculations(void)
+{
+    /* RFDS is only known to affect Intel Family 6 processors at this time. */
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+         boot_cpu_data.x86 != 6 )
+        return;
+
+    /*
+     * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable
+     * microcode, or an RFDS-aware hypervisor is levelling us in a pool.
+     */
+    if ( cpu_has_rfds_no || cpu_has_rfds_clear )
+        return;
+
+    /* If we're virtualised, don't attempt to synthesise RFDS_NO. */
+    if ( cpu_has_hypervisor )
+        return;
+
+    /*
+     * Not all CPUs are expected to get a microcode update enumerating one of
+     * RFDS_{NO,CLEAR}, or we might have out-of-date microcode.
+     */
+    switch ( boot_cpu_data.x86_model )
+    {
+    case INTEL_FAM6_ALDERLAKE:
+    case INTEL_FAM6_RAPTORLAKE:
+        /*
+         * Alder Lake and Raptor Lake might be a client SKU (with the
+         * Gracemont cores active, and therefore vulnerable) or might be a
+         * server SKU (with the Gracemont cores disabled, and therefore not
+         * vulnerable).
+         *
+         * See if the CPU identifies as hybrid to distinguish the two cases.
+         */
+        if ( !cpu_has_hybrid )
+            break;
+        fallthrough;
+    case INTEL_FAM6_ALDERLAKE_L:
+    case INTEL_FAM6_RAPTORLAKE_P:
+    case INTEL_FAM6_RAPTORLAKE_S:
+
+    case INTEL_FAM6_ATOM_GOLDMONT:      /* Apollo Lake */
+    case INTEL_FAM6_ATOM_GOLDMONT_D:    /* Denverton */
+    case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */
+    case INTEL_FAM6_ATOM_TREMONT_D:     /* Snow Ridge / Parker Ridge */
+    case INTEL_FAM6_ATOM_TREMONT:       /* Elkhart Lake */
+    case INTEL_FAM6_ATOM_TREMONT_L:     /* Jasper Lake */
+    case INTEL_FAM6_ATOM_GRACEMONT:     /* Alder Lake N */
+        return;
+    }
+
+    /*
+     * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO,
+     * perhaps because of its age or because of out-of-date microcode.
+     * Synthesise it.
+     */
+    setup_force_cpu_cap(X86_FEATURE_RFDS_NO);
+}
+
 static bool __init cpu_has_gds(void)
 {
     /*
@@ -1482,19 +1675,111 @@
     }
 }
 
+/*
+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/branch-history-injection.html
+ */
+static bool __init cpu_has_bug_bhi(void)
+{
+    /* BHI is only known to affect Intel Family 6 processors at this time. */
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+         boot_cpu_data.x86 != 6 )
+        return false;
+
+    if ( boot_cpu_has(X86_FEATURE_BHI_NO) )
+        return false;
+
+    if ( cpu_has_hypervisor )
+        return true; /* TODO: how to figure out if we're really eIBRS levelled out? */
+
+    return cpu_has_eibrs;
+}
+
+static void __init bhi_calculations(void)
+{
+    bool has_bhi = cpu_has_bug_bhi();
+
+    /*
+     * To mitigate BHI, we want to use BHI_DIS_S wherever possible, or the
+     * short sequence otherwise.  Other forms are available on request.
+     *
+     * We are responsible for performing default-conversion on opt_bhi_dis_s
+     * and opt_bhb_seq, irrespective of susceptibility to BHI.
+     */
+
+    if ( opt_bhi_dis_s == -1 )
+        opt_bhi_dis_s = has_bhi;
+
+    if ( !boot_cpu_has(X86_FEATURE_BHI_CTRL) )
+        opt_bhi_dis_s = false;
+
+    if ( opt_bhi_dis_s )
+        default_xen_spec_ctrl |= SPEC_CTRL_BHI_DIS_S;
+
+    if ( opt_bhb_seq == BHB_DEFAULT )
+    {
+        /*
+         * If we're using BHI_DIS_S, or we're not susceptible, don't activate
+         * the thunks.
+         */
+        if ( !has_bhi || opt_bhi_dis_s )
+            opt_bhb_seq = BHB_NONE;
+        else
+            opt_bhb_seq = BHB_SHORT;
+    }
+
+    /*
+     * We can use the TSX-based sequence even if TSX is disabled for e.g. TAA reasons.
+     * However, fall back to the loop sequence if there is no trace of RTM at
+     * all, as XBEGIN will #UD.
+     */
+    if ( opt_bhb_seq == BHB_TSX && !cpu_has_rtm && !cpu_has_rtm_always_abort &&
+         !cpu_has_tsx_force_abort )
+        opt_bhb_seq = BHB_SHORT;
+
+    /*
+     * Only activate SCF_entry_bhb for guests if a sequence is in place.
+     */
+    if ( opt_bhb_entry_pv == -1 )
+        opt_bhb_entry_pv = has_bhi && opt_bhb_seq != BHB_NONE;
+    if ( opt_bhb_entry_hvm == -1 )
+        opt_bhb_entry_hvm = has_bhi && opt_bhb_seq != BHB_NONE;
+
+    switch ( opt_bhb_seq )
+    {
+    case BHB_LONG:
+        setup_force_cpu_cap(X86_SPEC_BHB_LOOPS_LONG);
+        fallthrough;
+
+    case BHB_SHORT:
+        setup_force_cpu_cap(X86_SPEC_BHB_LOOPS);
+        break;
+
+    case BHB_TSX:
+        setup_force_cpu_cap(X86_SPEC_BHB_TSX);
+        break;
+
+    default:
+        break;
+    }
+}
+
 void spec_ctrl_init_domain(struct domain *d)
 {
     bool pv = is_pv_domain(d);
 
-    bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
-                 (opt_fb_clear_mmio && is_iommu_enabled(d)));
+    bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) ||
+                 (opt_verw_mmio && is_iommu_enabled(d)));
 
     bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) &&
                  (d->domain_id != 0 || opt_ibpb_entry_dom0));
 
-    d->arch.spec_ctrl_flags =
+    bool bhb  = ((pv ? opt_bhb_entry_pv : opt_bhb_entry_hvm) &&
+                 (d->domain_id != 0 || opt_bhb_entry_dom0));
+
+    d->arch.scf =
         (verw  ? SCF_verw        : 0) |
         (ibpb  ? SCF_entry_ibpb  : 0) |
+        (bhb   ? SCF_entry_bhb   : 0) |
         0;
 }
 
@@ -1502,7 +1787,7 @@
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
     bool has_spec_ctrl, ibrs = false, hw_smt_enabled;
-    bool cpu_has_bug_taa, retpoline_safe;
+    bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe;
 
     hw_smt_enabled = check_smt_enabled();
 
@@ -1597,7 +1882,7 @@
     {
         if ( opt_msr_sc_pv )
         {
-            default_spec_ctrl_flags |= SCF_ist_sc_msr;
+            default_scf |= SCF_ist_sc_msr;
             setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV);
         }
 
@@ -1608,7 +1893,7 @@
          * Xen's value is not restored atomically.  An early NMI hitting
         * the VMExit path needs to restore Xen's value for safety.
          */
-        default_spec_ctrl_flags |= SCF_ist_sc_msr;
+        default_scf |= SCF_ist_sc_msr;
        setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
     }
 }
@@ -1743,7 +2028,7 @@
     if ( opt_rsb_pv )
     {
         setup_force_cpu_cap(X86_FEATURE_SC_RSB_PV);
-        default_spec_ctrl_flags |= SCF_ist_rsb;
+        default_scf |= SCF_ist_rsb;
     }
 
     /*
@@ -1766,7 +2051,7 @@
          * possible rogue RSB speculation.
          */
         if ( !cpu_has_svm )
-            default_spec_ctrl_flags |= SCF_ist_rsb;
+            default_scf |= SCF_ist_rsb;
     }
 
     srso_calculations(hw_smt_enabled);
@@ -1779,7 +2064,7 @@
     if ( opt_eager_fpu == -1 )
         opt_eager_fpu = should_use_eager_fpu();
 
-    /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */
+    /* (Re)init BSP state now that default_scf has been calculated. */
     init_shadow_spec_ctrl_state();
 
     /*
@@ -1821,6 +2106,9 @@
     if ( !opt_branch_harden )
         setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN);
 
+    if ( !opt_lock_harden )
+        setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN);
+
     /*
      * We do not disable HT by default on affected hardware.
      *
@@ -1838,49 +2126,107 @@
            "enabled.  Please assess your configuration and choose an\n"
            "explicit 'smt=' setting.  See XSA-273.\n");
 
+    /*
+     * A brief summary of VERW-related changes.
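[Editor's note] Taken together, bhi_calculations() above is a small decision tree: prefer the hardware BHI_DIS_S control, fall back to the short software loop sequence when susceptible, and degrade a requested TSX sequence to the loops when XBEGIN would #UD. A condensed model, a hypothetical wrapper sharing the patch's enum shape:

enum bhb_seq { BHB_DEFAULT, BHB_NONE, BHB_TSX, BHB_SHORT, BHB_LONG };

/* Sketch of the default conversion and TSX degrade in one place. */
static enum bhb_seq pick_bhb_seq(_Bool has_bhi, _Bool bhi_dis_s,
                                 enum bhb_seq req, _Bool any_rtm)
{
    if ( req == BHB_DEFAULT )
        req = (has_bhi && !bhi_dis_s) ? BHB_SHORT : BHB_NONE;

    if ( req == BHB_TSX && !any_rtm )  /* XBEGIN would #UD: degrade */
        req = BHB_SHORT;

    return req;
}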
+     *
+     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html
+     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html
+     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html
+     *
+     * Relevant ucodes:
+     *
+     * - May 2019, for MDS.  Introduces the MD_CLEAR CPUID bit and VERW side
+     *   effects to scrub Store/Load/Fill buffers as applicable.  MD_CLEAR
+     *   exists architecturally, even when the side effects have been removed.
+     *
+     *   Use VERW to scrub on return-to-guest.  Parts with L1D_FLUSH to
+     *   mitigate L1TF have the same side effect, so no need to do both.
+     *
+     *   Various Atoms suffer from Store-buffer sampling only.  Store buffers
+     *   are statically partitioned between non-idle threads, so scrubbing is
+     *   wanted when going idle too.
+     *
+     *   Load ports and Fill buffers are competitively shared between threads.
+     *   SMT must be disabled for VERW scrubbing to be fully effective.
+     *
+     * - November 2019, for TAA.  Extended VERW side effects to TSX-enabled
+     *   MDS_NO parts.
+     *
+     * - February 2022, for Client TSX de-feature.  Removed VERW side effects
+     *   from Client CPUs only.
+     *
+     * - May 2022, for MMIO Stale Data.  (Re)introduced Fill Buffer scrubbing
+     *   on all MMIO-affected parts which didn't already have it for MDS
+     *   reasons, enumerating FB_CLEAR on those parts only.
+     *
+     *   If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing
+     *   side effects as VERW and cannot be used in its place.
+     *
+     * - March 2024, for RFDS.  Enumerate RFDS_CLEAR to mean that VERW now
+     *   scrubs non-architectural entries from certain register files.
+     */
     mds_calculations();
+    rfds_calculations();
 
     /*
-     * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
-     * reintroduced the VERW fill buffer flushing side effect because of a
-     * susceptibility to FBSDP.
+     * Parts which enumerate FB_CLEAR are those with now-updated microcode
+     * which weren't susceptible to the original MFBDS (and therefore didn't
+     * have Fill Buffer scrubbing side effects to begin with, or were Client
+     * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had
+     * the scrubbing reintroduced because of a susceptibility to FBSDP.
      *
      * If unprivileged guests have (or will have) MMIO mappings, we can
      * mitigate cross-domain leakage of fill buffer data by issuing VERW on
-     * the return-to-guest path.
+     * the return-to-guest path.  This is only a token effort if SMT is
+     * active.
      */
     if ( opt_unpriv_mmio )
-        opt_fb_clear_mmio = cpu_has_fb_clear;
+        opt_verw_mmio = cpu_has_fb_clear;
 
     /*
-     * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
-     * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
-     * but it is somewhat better than nothing.
+     * MD_CLEAR is enumerated architecturally forevermore, even after the
+     * scrubbing side effects have been removed.  Create ourselves a version
+     * which expresses whether we think MD_CLEAR is having any useful side
+     * effect.
*/ - if ( opt_md_clear_pv == -1 ) - opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && - boot_cpu_has(X86_FEATURE_MD_CLEAR)); - if ( opt_md_clear_hvm == -1 ) - opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && - boot_cpu_has(X86_FEATURE_MD_CLEAR)); + cpu_has_useful_md_clear = (cpu_has_md_clear && + (cpu_has_bug_mds || cpu_has_bug_msbds_only)); /* - * Enable MDS/MMIO defences as applicable. The Idle blocks need using if - * either the PV or HVM MDS defences are used, or if we may give MMIO - * access to untrusted guests. - * - * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with - * equivalent semantics to avoid needing to perform both flushes on the - * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for - * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) + * By default, use VERW scrubbing on applicable hardware, if we think it's + * going to have an effect. This will only be a token effort for + * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) + opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + if ( opt_verw_hvm == -1 ) + opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + /* + * If SMT is active, and we're protecting against MDS or MMIO stale data, + * we need to scrub before going idle as well as on return to guest. + * Various pipeline resources are repartitioned amongst non-idle threads. * - * After calculating the appropriate idle setting, simplify - * opt_md_clear_hvm to mean just "should we VERW on the way into HVM - * guests", so spec_ctrl_init_domain() can calculate suitable settings. + * We don't need to scrub on idle for RFDS. There are no affected cores + * which support SMT, despite there being affected cores in hybrid systems + * which have SMT elsewhere in the platform. */ - if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) + if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || + opt_verw_mmio) && hw_smt_enabled ) setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); - opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; + + /* + * After calculating the appropriate idle setting, simplify opt_verw_hvm + * to mean just "should we VERW on the way into HVM guests", so + * spec_ctrl_init_domain() can calculate suitable settings. + * + * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the + * only *_CLEAR we can see. + */ + if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear && + !cpu_has_rfds_clear ) + opt_verw_hvm = false; /* * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT @@ -1965,9 +2311,68 @@ gds_calculations(); + bhi_calculations(); + print_details(thunk); /* + * With the alternative blocks now chosen, see if we need any other + * adjustments for safety. + * + * We compile the LFENCE in, and patch it out if it's not needed. + * + * Notes: + * - SPEC_CTRL_ENTRY_FROM_SVM doesn't need an LFENCE because it has an + * unconditional STGI. + * - SPEC_CTRL_ENTRY_FROM_IST handles its own safety, without the use of + * alternatives. + * - DO_OVERWRITE_RSB has conditional branches in it, but it's an inline + * sequence. It is considered safe for uarch reasons. + */ + { + /* + * SPEC_CTRL_ENTRY_FROM_PV conditional safety + * + * A BHB sequence, if used, is a conditional action and last. If we + * have this, then we must have the LFENCE. + * + * Otherwise, DO_SPEC_CTRL_ENTRY (X86_FEATURE_SC_MSR_PV if used) is an + * unconditional WRMSR. 
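[Editor's note] The tail of this hunk reduces the VERW-versus-L1D_FLUSH interaction to a single rule: L1D_FLUSH may stand in for VERW only when MD_CLEAR is the sole scrubbing side effect visible. Expressed as a standalone predicate with a hypothetical name:

#include <stdbool.h>

/* Sketch of the simplified rule: L1D_FLUSH may substitute for VERW only
 * when MD_CLEAR is the only *_CLEAR scrubbing behaviour enumerated. */
static bool l1d_flush_covers_verw(bool l1d_flush, bool md_clear,
                                  bool fb_clear, bool rfds_clear)
{
    return l1d_flush && md_clear && !fb_clear && !rfds_clear;
}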
If we do have it, or we're not using any + * prior conditional block, then it's safe to drop the LFENCE. + */ + if ( !opt_bhb_entry_pv && + (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || + !boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)) ) + setup_force_cpu_cap(X86_SPEC_NO_LFENCE_ENTRY_PV); + + /* + * SPEC_CTRL_ENTRY_FROM_INTR conditional safety + * + * A BHB sequence, if used, is a conditional action and last. If we + * have this, then we must have the LFENCE. + * + * Otherwise DO_SPEC_CTRL_ENTRY (X86_FEATURE_SC_MSR_PV if used) is an + * unconditional WRMSR. If we have it, or we have no protections + * active in the block that is skipped when interrupting guest + * context, then it's safe to drop the LFENCE. + */ + if ( !opt_bhb_entry_pv && + (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || + (!boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) && + !boot_cpu_has(X86_FEATURE_SC_RSB_PV))) ) + setup_force_cpu_cap(X86_SPEC_NO_LFENCE_ENTRY_INTR); + + /* + * SPEC_CTRL_ENTRY_FROM_VMX conditional safety + * + * A BHB sequence, if used, is the only conditional action, so if we + * don't have it, we don't need the safety LFENCE. + */ + if ( !opt_bhb_entry_hvm ) + setup_force_cpu_cap(X86_SPEC_NO_LFENCE_ENTRY_VMX); + } + + /* * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard * any firmware settings. For performance reasons, when safe to do so, we * delay applying non-zero settings until after dom0 has been constructed. @@ -1991,7 +2396,7 @@ { info->shadow_spec_ctrl = 0; barrier(); - info->spec_ctrl_flags |= SCF_use_shadow; + info->scf |= SCF_use_shadow; barrier(); } diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/time.c xen-4.17.5/xen/arch/x86/time.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/time.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/time.c 2024-08-14 09:03:57.000000000 +0000 @@ -2288,7 +2288,7 @@ } /* keep pit enabled for pit_broadcast working while cpuidle enabled */ -static int _disable_pit_irq(void(*hpet_broadcast_setup)(void)) +static int _disable_pit_irq(bool init) { int ret = 1; @@ -2303,13 +2303,13 @@ */ if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) ) { - hpet_broadcast_setup(); + init ? hpet_broadcast_init() : hpet_broadcast_resume(); if ( !hpet_broadcast_is_available() ) { if ( xen_cpuidle > 0 ) { - printk("%ps() failed, turning to PIT broadcast\n", - hpet_broadcast_setup); + printk("hpet_broadcast_%s() failed, turning to PIT broadcast\n", + init ? "init" : "resume"); return -1; } ret = 0; @@ -2326,7 +2326,7 @@ static int __init cf_check disable_pit_irq(void) { - if ( !_disable_pit_irq(hpet_broadcast_init) ) + if ( !_disable_pit_irq(true) ) { xen_cpuidle = 0; printk("CPUIDLE: disabled due to no HPET. " @@ -2387,7 +2387,7 @@ resume_platform_timer(); - if ( !_disable_pit_irq(hpet_broadcast_resume) ) + if ( !_disable_pit_irq(false) ) BUG(); init_percpu_time(); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/traps.c xen-4.17.5/xen/arch/x86/traps.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/traps.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/traps.c 2024-08-14 09:03:57.000000000 +0000 @@ -856,7 +856,7 @@ } static void fixup_exception_return(struct cpu_user_regs *regs, - unsigned long fixup) + unsigned long fixup, unsigned long stub_ra) { if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) { @@ -873,7 +873,8 @@ /* * Search for %rip. The shstk currently looks like this: * - * ... [Likely pointed to by SSP] + * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] + * ... 
[Pointed to by SSP for most exceptions, empty in IST cases] * %cs [== regs->cs] * %rip [== regs->rip] * SSP [Likely points to 3 slots higher, above %cs] @@ -891,7 +892,56 @@ */ if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) { + unsigned long primary_shstk = + (ssp & ~(STACK_SIZE - 1)) + + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; + wrss(fixup, ptr); + + if ( !stub_ra ) + goto shstk_done; + + /* + * Stub recovery ought to happen only when the outer context + * was on the main shadow stack. We need to also "pop" the + * stub's return address from the interrupted context's shadow + * stack. That is, + * - if we're still on the main stack, we need to move the + * entire stack (up to and including the exception frame) + * up by one slot, incrementing the original SSP in the + * exception frame, + * - if we're on an IST stack, we need to increment the + * original SSP. + */ + BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); + + if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) + { + /* + * We're on an IST stack. First make sure the two return + * addresses actually match. Then increment the interrupted + * context's SSP. + */ + BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); + wrss(ptr[-1] + 8, &ptr[-1]); + goto shstk_done; + } + + /* Make sure the two return addresses actually match. */ + BUG_ON(stub_ra != ptr[2]); + + /* Move exception frame, updating SSP there. */ + wrss(ptr[1], &ptr[2]); /* %cs */ + wrss(ptr[0], &ptr[1]); /* %rip */ + wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ + + /* Move all newer entries. */ + while ( --ptr != _p(ssp) ) + wrss(ptr[-1], &ptr[0]); + + /* Finally account for our own stack having shifted up. */ + asm volatile ( "incsspd %0" :: "r" (2) ); + goto shstk_done; } } @@ -912,7 +962,8 @@ static bool extable_fixup(struct cpu_user_regs *regs, bool print) { - unsigned long fixup = search_exception_table(regs); + unsigned long stub_ra = 0; + unsigned long fixup = search_exception_table(regs, &stub_ra); if ( unlikely(fixup == 0) ) return false; @@ -926,7 +977,7 @@ vector_name(regs->entry_vector), regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup)); - fixup_exception_return(regs, fixup); + fixup_exception_return(regs, fixup, stub_ra); this_cpu(last_extable_addr) = regs->rip; return true; @@ -1214,7 +1265,7 @@ void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); fn(regs); - fixup_exception_return(regs, (unsigned long)eip); + fixup_exception_return(regs, (unsigned long)eip, 0); return; } @@ -1235,7 +1286,7 @@ case BUGFRAME_warn: printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); show_execution_state(regs); - fixup_exception_return(regs, (unsigned long)eip); + fixup_exception_return(regs, (unsigned long)eip, 0); return; case BUGFRAME_bug: diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/tsx.c xen-4.17.5/xen/arch/x86/tsx.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/tsx.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/tsx.c 2024-08-14 09:03:57.000000000 +0000 @@ -1,5 +1,6 @@ #include #include +#include #include /* @@ -9,6 +10,7 @@ * -1 => Default, altered to 0/1 (if unspecified) by: * - TAA heuristics/settings for speculative safety * - "TSX vs PCR3" select for TSX memory ordering safety + * -2 => Implicit tsx=0 (from RTM_ALWAYS_ABORT vs RTM mismatch) * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) * * This is arranged such that the bottom bit encodes whether TSX is actually @@ -122,11 +124,50 @@ if ( cpu_has_tsx_force_abort ) { + uint64_t val; + /* - * On an early TSX-enable Skylake part subject to the memory + * On an early TSX-enabled Skylake part 
subject to the memory * ordering erratum, with at least the March 2019 microcode. */ + rdmsrl(MSR_TSX_FORCE_ABORT, val); + + /* + * At the time of writing (April 2024), it was discovered that + * some parts (e.g. CoffeeLake 8th Gen, 06-9e-0a, ucode 0xf6) + * advertise RTM_ALWAYS_ABORT, but XBEGIN instructions #UD. Other + * similar parts (e.g. KabyLake Xeon-E3, 06-9e-09, ucode 0xf8) + * operate as expected. + * + * In this case: + * - RTM_ALWAYS_ABORT and MSR_TSX_FORCE_ABORT are enumerated. + * - XBEGIN instructions genuinely #UD. + * - MSR_TSX_FORCE_ABORT appears to be write-discard and fails to + * hold its value. + * - HLE and RTM are not enumerated, despite + * MSR_TSX_FORCE_ABORT.TSX_CPUID_CLEAR being clear. + * + * Spot RTM being unavailable without CLEAR_CPUID being set, and + * treat it as if no TSX is available at all. This will prevent + * Xen from thinking it's safe to offer HLE/RTM to VMs. + */ + if ( val == 0 && cpu_has_rtm_always_abort && !cpu_has_rtm ) + { + printk(XENLOG_ERR + "FIRMWARE BUG: CPU %02x-%02x-%02x, ucode 0x%08x: RTM_ALWAYS_ABORT vs RTM mismatch\n", + boot_cpu_data.x86, boot_cpu_data.x86_model, + boot_cpu_data.x86_mask, this_cpu(cpu_sig).rev); + + setup_clear_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); + setup_clear_cpu_cap(X86_FEATURE_TSX_FORCE_ABORT); + + if ( opt_tsx < 0 ) + opt_tsx = -2; + + goto done_probe; + } + /* * Probe for the June 2021 microcode which de-features TSX on * client parts. (Note - this is a subset of parts impacted by @@ -136,15 +177,8 @@ * read as zero if TSX_FORCE_ABORT.ENABLE_RTM has been set before * we run. */ - if ( !has_rtm_always_abort ) - { - uint64_t val; - - rdmsrl(MSR_TSX_FORCE_ABORT, val); - - if ( val & TSX_ENABLE_RTM ) - has_rtm_always_abort = true; - } + if ( val & TSX_ENABLE_RTM ) + has_rtm_always_abort = true; /* * If no explicit tsx= option is provided, pick a default. @@ -199,6 +233,7 @@ setup_force_cpu_cap(X86_FEATURE_RTM); } } + done_probe: /* * Note: MSR_TSX_CTRL is enumerated on TSX-enabled MDS_NO and later parts. diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/asm-offsets.c xen-4.17.5/xen/arch/x86/x86_64/asm-offsets.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/asm-offsets.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/x86_64/asm-offsets.c 2024-08-14 09:03:57.000000000 +0000 @@ -51,6 +51,31 @@ OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); BLANK(); + /* + * EFRAME_* is for the entry/exit logic where %rsp is pointing at + * UREGS_error_code and GPRs are still/already guest values. + */ +#define OFFSET_EF(sym, mem, ...) \ + DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ + offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) + + OFFSET_EF(EFRAME_entry_vector, entry_vector); + OFFSET_EF(EFRAME_rip, rip); + OFFSET_EF(EFRAME_cs, cs); + OFFSET_EF(EFRAME_eflags, eflags); + + /* + * These aren't real fields. They're spare space, used by the IST + * exit-to-xen path. 
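As an aside on the OFFSET_EF() helper above: each EFRAME_* constant is plain offsetof() subtraction, optionally biased by a few bytes so it addresses spare space inside an existing slot. A standalone sketch of the arithmetic, using an illustrative layout rather than Xen's real struct cpu_user_regs:

    #include <stddef.h>
    #include <stdio.h>

    /* Illustrative exception frame; hardware pushed error_code..rsp. */
    struct frame {
        unsigned long error_code;  /* %rsp points here on entry/exit */
        unsigned long rip;
        unsigned long cs;
        unsigned long eflags;      /* zero-extended: bytes 4-7 are spare */
        unsigned long rsp;
    };

    /* Offset of 'mem' relative to error_code, plus an optional byte bias. */
    #define EFRAME(mem, bias) \
        (offsetof(struct frame, mem) - offsetof(struct frame, error_code) + (bias))

    int main(void)
    {
        printf("rip        %zu\n", EFRAME(rip, 0));     /* 8 */
        printf("eflags     %zu\n", EFRAME(eflags, 0));  /* 24 */
        printf("shadow_scf %zu\n", EFRAME(eflags, 4));  /* 28 */
        printf("shadow_sel %zu\n", EFRAME(eflags, 6));  /* 30 */
        return 0;
    }

The +4/+6 biases land in the upper bytes of the eflags slot, which the CPU zero-extended on exception delivery, so they are free for the IST exit path to use as scratch space.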
+ */ + OFFSET_EF(EFRAME_shadow_scf, eflags, +4); + OFFSET_EF(EFRAME_shadow_sel, eflags, +6); + + OFFSET_EF(EFRAME_rsp, rsp); + BLANK(); + +#undef OFFSET_EF + OFFSET(VCPU_processor, struct vcpu, processor); OFFSET(VCPU_domain, struct vcpu, domain); OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); @@ -118,6 +143,8 @@ #endif OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); + OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); + OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); @@ -127,7 +154,7 @@ OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl); OFFSET(CPUINFO_xen_spec_ctrl, struct cpu_info, xen_spec_ctrl); OFFSET(CPUINFO_last_spec_ctrl, struct cpu_info, last_spec_ctrl); - OFFSET(CPUINFO_spec_ctrl_flags, struct cpu_info, spec_ctrl_flags); + OFFSET(CPUINFO_scf, struct cpu_info, scf); OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed); OFFSET(CPUINFO_use_pv_cr3, struct cpu_info, use_pv_cr3); DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/compat/entry.S xen-4.17.5/xen/arch/x86/x86_64/compat/entry.S --- xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/compat/entry.S 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/x86_64/compat/entry.S 2024-08-14 09:03:57.000000000 +0000 @@ -15,17 +15,19 @@ ENDBR64 ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP pushq $0 - movl $HYPERCALL_VECTOR, 4(%rsp) + movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ + GET_STACK_END(14) + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ sti CR4_PV32_RESTORE - GET_CURRENT(bx) + movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx mov %rsp, %rdi call do_entry_int82 @@ -161,6 +163,12 @@ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ RESTORE_ALL adj=8 compat=1 + + /* Account for ev/ec having already been popped off the stack. */ + SPEC_CTRL_COND_VERW \ + scf=STK_REL(CPUINFO_scf, CPUINFO_rip), \ + sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) + .Lft0: iretq _ASM_PRE_EXTABLE(.Lft0, handle_exception) diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/entry.S xen-4.17.5/xen/arch/x86/x86_64/entry.S --- xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_64/entry.S 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/x86_64/entry.S 2024-08-14 09:03:57.000000000 +0000 @@ -38,6 +38,14 @@ setc %cl leal (,%rcx,TBF_INTERRUPT),%ecx + /* + * The PV ABI hardcodes the (guest-inaccessible and virtual) + * SYSCALL_MASK MSR such that DF (and nothing else) would be cleared. + * Note that the equivalent of IF (VGCF_syscall_disables_events) is + * dealt with separately above. 
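The %esi dance in the hunk above is a branch-free way of deciding, at bounce time, whether to clear DF: the mask is either ~X86_EFLAGS_DF (a real syscall callback, where the ABI's SYSCALL_MASK behaviour applies) or ~0 (the #UD fallback, where DF must be left alone). A minimal C rendering of the same idea; the flag value is the architectural DF bit, everything else is illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define X86_EFLAGS_DF (1u << 10)

    /* Select the mask up front, apply it unconditionally at the use site. */
    static uint32_t bounce_eflags(uint32_t eflags, int real_callback)
    {
        uint32_t mask = real_callback ? ~X86_EFLAGS_DF  /* ABI: clear DF */
                                      : ~0u;            /* #UD path: keep DF */

        return eflags & mask;
    }

    int main(void)
    {
        printf("%#x\n", bounce_eflags(X86_EFLAGS_DF | 2, 1)); /* 0x2 */
        printf("%#x\n", bounce_eflags(X86_EFLAGS_DF | 2, 0)); /* 0x402 */
        return 0;
    }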
+ */ + mov $~X86_EFLAGS_DF, %esi + test %rax, %rax UNLIKELY_START(z, syscall_no_callback) /* TB_eip == 0 => #UD */ mov VCPU_trap_ctxt(%rbx), %rdi @@ -47,12 +55,14 @@ testb $4, X86_EXC_UD * TRAPINFO_sizeof + TRAPINFO_flags(%rdi) setnz %cl lea TBF_EXCEPTION(, %rcx, TBF_INTERRUPT), %ecx + or $~0, %esi /* Don't clear DF */ UNLIKELY_END(syscall_no_callback) movq %rax,TRAPBOUNCE_eip(%rdx) movb %cl,TRAPBOUNCE_flags(%rdx) call create_bounce_frame - andl $~X86_EFLAGS_DF,UREGS_eflags(%rsp) + /* Conditionally clear DF */ + and %esi, UREGS_eflags(%rsp) /* %rbx: struct vcpu */ test_all_events: ASSERT_NOT_IN_ATOMIC @@ -190,15 +200,15 @@ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ RESTORE_ALL - testw $TRAP_syscall,4(%rsp) + testw $TRAP_syscall, EFRAME_entry_vector(%rsp) jz iret_exit_to_guest - movq 24(%rsp),%r11 # RFLAGS + mov EFRAME_eflags(%rsp), %r11 andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 orq $X86_EFLAGS_IF,%r11 /* Don't use SYSRET path if the return address is not canonical. */ - movq 8(%rsp),%rcx + mov EFRAME_rip(%rsp), %rcx sarq $47,%rcx incl %ecx cmpl $1,%ecx @@ -213,20 +223,26 @@ ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK #endif - movq 8(%rsp), %rcx # RIP - cmpw $FLAT_USER_CS32,16(%rsp)# CS - movq 32(%rsp),%rsp # RSP + mov EFRAME_rip(%rsp), %rcx + + SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ + + cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) + mov EFRAME_rsp(%rsp), %rsp je 1f sysretq 1: sysretl ALIGN .Lrestore_rcx_iret_exit_to_guest: - movq 8(%rsp), %rcx # RIP + mov EFRAME_rip(%rsp), %rcx /* No special register assumptions. */ iret_exit_to_guest: - andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) - orl $X86_EFLAGS_IF,24(%rsp) + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) + orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) + + SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ + addq $8,%rsp .Lft0: iretq _ASM_PRE_EXTABLE(.Lft0, handle_exception) @@ -257,24 +273,25 @@ pushq $FLAT_KERNEL_CS64 pushq %rcx pushq $0 - movl $TRAP_syscall, 4(%rsp) + movl $TRAP_syscall, EFRAME_entry_vector(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ + GET_STACK_END(14) + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - GET_STACK_END(bx) - mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx test %rcx, %rcx jz .Llstar_cr3_okay - movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) mov %rcx, %cr3 /* %r12 is still zero at this point. */ - mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%r14) .Llstar_cr3_okay: sti - movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx testb $TF_kernel_mode,VCPU_thread_flags(%rbx) jz switch_to_kernel @@ -294,26 +311,27 @@ pushq $FLAT_USER_CS32 pushq %rcx pushq $0 - movl $TRAP_syscall, 4(%rsp) + movl $TRAP_syscall, EFRAME_entry_vector(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ + GET_STACK_END(14) + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. 
*/ - GET_STACK_END(bx) - mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx test %rcx, %rcx jz .Lcstar_cr3_okay - movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) mov %rcx, %cr3 /* %r12 is still zero at this point. */ - mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%r14) .Lcstar_cr3_okay: sti CR4_PV32_RESTORE - movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx #ifdef CONFIG_PV32 movq VCPU_domain(%rbx), %rcx @@ -335,26 +353,27 @@ pushq $3 /* ring 3 null cs */ pushq $0 /* null rip */ pushq $0 - movl $TRAP_syscall, 4(%rsp) + movl $TRAP_syscall, EFRAME_entry_vector(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ + GET_STACK_END(14) + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - GET_STACK_END(bx) /* PUSHF above has saved EFLAGS.IF clear (the caller had it set). */ orl $X86_EFLAGS_IF, UREGS_eflags(%rsp) - mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx test %rcx, %rcx jz .Lsyse_cr3_okay - movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) mov %rcx, %cr3 /* %r12 is still zero at this point. */ - mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%r14) .Lsyse_cr3_okay: sti - movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx cmpb $0,VCPU_sysenter_disables_events(%rbx) movq VCPU_sysenter_addr(%rbx),%rax setne %cl @@ -389,20 +408,21 @@ ENDBR64 ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP pushq $0 - movl $0x80, 4(%rsp) + movl $0x80, EFRAME_entry_vector(%rsp) SAVE_ALL - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ + GET_STACK_END(14) + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - GET_STACK_END(bx) - mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx test %rcx, %rcx jz .Lint80_cr3_okay - movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) mov %rcx, %cr3 /* %r12 is still zero at this point. */ - mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + mov %r12, STACK_CPUINFO_FIELD(xen_cr3)(%r14) .Lint80_cr3_okay: sti @@ -412,7 +432,7 @@ call check_for_unexpected_msi UNLIKELY_END(msi_check) - movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx mov VCPU_trap_ctxt(%rbx), %rsi mov VCPU_domain(%rbx), %rax @@ -627,6 +647,16 @@ #ifdef CONFIG_PV ENTRY(continue_pv_domain) ENDBR64 + + /* + * For speculative type confusion reasons, we're CALLed rather than + * JMPed to. Drop the return address. + */ + add $8, %rsp +#ifdef CONFIG_XEN_SHSTK + ALTERNATIVE "", "mov $2, %eax; incsspd %eax", X86_FEATURE_XEN_SHSTK +#endif + call check_wakeup_from_wait ret_from_intr: GET_CURRENT(bx) @@ -649,7 +679,7 @@ .section .init.text, "ax", @progbits ENTRY(early_page_fault) ENDBR64 - movl $TRAP_page_fault, 4(%rsp) + movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) SAVE_ALL movq %rsp, %rdi call do_early_page_fault @@ -679,9 +709,22 @@ UNLIKELY_END(exit_cr3) /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ - SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ + SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ RESTORE_ALL adj=8 + + /* + * When the CPU pushed this exception frame, it zero-extended eflags. + * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of + * scf and ver_sel above eflags, as we can't use any GPRs, + * and we're at a random place on the stack, not in a CPUFINFO block. + * + * Account for ev/ec having already been popped off the stack. + */ + SPEC_CTRL_COND_VERW \ + scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ + sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) + iretq ENTRY(common_interrupt) @@ -690,7 +733,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx @@ -716,7 +759,7 @@ ENTRY(page_fault) ENDBR64 - movl $TRAP_page_fault,4(%rsp) + movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) /* No special register assumptions. */ GLOBAL(handle_exception) ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP @@ -724,7 +767,7 @@ GET_STACK_END(14) - SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ + SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx @@ -892,90 +935,90 @@ ENTRY(divide_error) ENDBR64 pushq $0 - movl $TRAP_divide_error,4(%rsp) + movl $TRAP_divide_error, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(coprocessor_error) ENDBR64 pushq $0 - movl $TRAP_copro_error,4(%rsp) + movl $TRAP_copro_error, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(simd_coprocessor_error) ENDBR64 pushq $0 - movl $TRAP_simd_error,4(%rsp) + movl $TRAP_simd_error, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(device_not_available) ENDBR64 pushq $0 - movl $TRAP_no_device,4(%rsp) + movl $TRAP_no_device, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(debug) ENDBR64 pushq $0 - movl $TRAP_debug,4(%rsp) + movl $TRAP_debug, EFRAME_entry_vector(%rsp) jmp handle_ist_exception ENTRY(int3) ENDBR64 pushq $0 - movl $TRAP_int3,4(%rsp) + movl $TRAP_int3, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(overflow) ENDBR64 pushq $0 - movl $TRAP_overflow,4(%rsp) + movl $TRAP_overflow, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(bounds) ENDBR64 pushq $0 - movl $TRAP_bounds,4(%rsp) + movl $TRAP_bounds, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(invalid_op) ENDBR64 pushq $0 - movl $TRAP_invalid_op,4(%rsp) + movl $TRAP_invalid_op, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(invalid_TSS) ENDBR64 - movl $TRAP_invalid_tss,4(%rsp) + movl $TRAP_invalid_tss, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(segment_not_present) ENDBR64 - movl $TRAP_no_segment,4(%rsp) + movl $TRAP_no_segment, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(stack_segment) ENDBR64 - movl $TRAP_stack_error,4(%rsp) + movl $TRAP_stack_error, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(general_protection) ENDBR64 - movl $TRAP_gp_fault,4(%rsp) + movl $TRAP_gp_fault, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(alignment_check) ENDBR64 - movl $TRAP_alignment_check,4(%rsp) + movl $TRAP_alignment_check, EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(entry_CP) ENDBR64 - movl $X86_EXC_CP, 4(%rsp) + movl $X86_EXC_CP, 
EFRAME_entry_vector(%rsp) jmp handle_exception ENTRY(double_fault) ENDBR64 - movl $TRAP_double_fault,4(%rsp) + movl $TRAP_double_fault, EFRAME_entry_vector(%rsp) /* Set AC to reduce chance of further SMAP faults */ ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP SAVE_ALL @@ -1001,7 +1044,7 @@ ENTRY(nmi) ENDBR64 pushq $0 - movl $TRAP_nmi,4(%rsp) + movl $TRAP_nmi, EFRAME_entry_vector(%rsp) handle_ist_exception: ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP SAVE_ALL @@ -1134,7 +1177,7 @@ ENTRY(machine_check) ENDBR64 pushq $0 - movl $TRAP_machine_check,4(%rsp) + movl $TRAP_machine_check, EFRAME_entry_vector(%rsp) jmp handle_ist_exception /* No op trap handler. Required for kexec crash path. */ @@ -1171,7 +1214,7 @@ 1: ENDBR64 pushq $0 - movb $vec,4(%rsp) + movb $vec, EFRAME_entry_vector(%rsp) jmp common_interrupt entrypoint 1b @@ -1185,7 +1228,7 @@ test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ jz 2f /* size is 8 bytes. Check whether the processor gave us an */ pushq $0 /* error code, and insert an empty one if not. */ -2: movb $vec,4(%rsp) +2: movb $vec, EFRAME_entry_vector(%rsp) jmp handle_exception entrypoint 1b diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_emulate/x86_emulate.c xen-4.17.5/xen/arch/x86/x86_emulate/x86_emulate.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/x86_emulate/x86_emulate.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/x86_emulate/x86_emulate.c 2024-08-14 09:03:57.000000000 +0000 @@ -6829,7 +6829,8 @@ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */ - generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || + generate_exception_if((evex.reg != 0xf || !evex.RX || !evex.R || + evex.opmsk || (ea.type != OP_REG && evex.brs)), EXC_UD); host_and_vcpu_must_have(avx512f); @@ -10705,7 +10706,7 @@ goto pextr; case X86EMUL_OPC_EVEX_66(0x0f, 0xc5): /* vpextrw $imm8,xmm,reg */ - generate_exception_if(ea.type != OP_REG, EXC_UD); + generate_exception_if(ea.type != OP_REG || !evex.R, EXC_UD); /* Convert to alternative encoding: We want to use a memory operand. */ evex.opcx = ext_0f3a; b = 0x15; diff -Nru xen-4.17.3+10-g091466ba55/xen/arch/x86/xstate.c xen-4.17.5/xen/arch/x86/xstate.c --- xen-4.17.3+10-g091466ba55/xen/arch/x86/xstate.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/arch/x86/xstate.c 2024-08-14 09:03:57.000000000 +0000 @@ -642,13 +642,6 @@ return; } - /* - * Zap the cached values to make set_xcr0() and set_msr_xss() really - * write it. - */ - this_cpu(xcr0) = 0; - this_cpu(xss) = ~0; - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); feature_mask = (((u64)edx << 32) | eax) & XCNTXT_MASK; BUG_ON(!valid_xcr0(feature_mask)); @@ -658,8 +651,19 @@ * Set CR4_OSXSAVE and run "cpuid" to get xsave_cntxt_size. */ set_in_cr4(X86_CR4_OSXSAVE); + + /* + * Zap the cached values to make set_xcr0() and set_msr_xss() really write + * the hardware register. 
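The reordering in this xstate.c hunk matters because set_xcr0() and set_msr_xss() are cache-filtered writers: they skip the expensive hardware write whenever the requested value matches the per-CPU cache. A toy sketch of the pattern, and of why a resume path must poison the cache first (the helper names and bodies here are stand-ins, not Xen's real code):

    #include <stdint.h>
    #include <stdbool.h>

    static uint64_t cached_xcr0;                 /* per-CPU in the real code */

    static void hw_write_xcr0(uint64_t val) { (void)val; /* xsetbv stand-in */ }

    static bool set_xcr0(uint64_t val)
    {
        if ( val == cached_xcr0 )
            return true;                         /* hardware write skipped */
        hw_write_xcr0(val);
        cached_xcr0 = val;
        return true;
    }

    /* Across S3 the hardware state is lost, so the cache must be zapped to
     * a value no caller will request, or set_xcr0() would short-circuit. */
    static void resume(void)
    {
        cached_xcr0 = 0;
        if ( !set_xcr0(0x7) )                    /* now reaches hardware */
        {
            /* BUG() in the real code */
        }
    }

    int main(void) { resume(); return 0; }

Note the diff poisons the XSS cache with ~0 rather than 0, since 0 is exactly the value the following set_msr_xss(0) asks for.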
+ */ + this_cpu(xcr0) = 0; if ( !set_xcr0(feature_mask) ) BUG(); + if ( cpu_has_xsaves ) + { + this_cpu(xss) = ~0; + set_msr_xss(0); + } if ( bsp ) { diff -Nru xen-4.17.3+10-g091466ba55/xen/build.mk xen-4.17.5/xen/build.mk --- xen-4.17.3+10-g091466ba55/xen/build.mk 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/build.mk 2024-08-14 09:03:57.000000000 +0000 @@ -1,6 +1,6 @@ quiet_cmd_banner = BANNER $@ define cmd_banner - if which figlet >/dev/null 2>&1 ; then \ + if command -v figlet >/dev/null 2>&1 ; then \ echo " Xen $(XEN_FULLVERSION)" | figlet -f $< > $@.tmp; \ else \ echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \ diff -Nru xen-4.17.3+10-g091466ba55/xen/common/Kconfig xen-4.17.5/xen/common/Kconfig --- xen-4.17.3+10-g091466ba55/xen/common/Kconfig 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/Kconfig 2024-08-14 09:03:57.000000000 +0000 @@ -173,6 +173,23 @@ If unsure, say Y. +config SPECULATIVE_HARDEN_LOCK + bool "Speculative lock context hardening" + default y + depends on X86 + help + Contemporary processors may use speculative execution as a + performance optimisation, but this can potentially be abused by an + attacker to leak data via speculative sidechannels. + + One source of data leakage is via speculative accesses to lock + critical regions. + + This option is disabled by default at run time, and needs to be + enabled on the command line. + + If unsure, say Y. + endmenu config DIT_DEFAULT diff -Nru xen-4.17.3+10-g091466ba55/xen/common/bunzip2.c xen-4.17.5/xen/common/bunzip2.c --- xen-4.17.3+10-g091466ba55/xen/common/bunzip2.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/bunzip2.c 2024-08-14 09:03:57.000000000 +0000 @@ -221,7 +221,8 @@ RUNB) */ symCount = symTotal+2; for (j = 0; j < groupCount; j++) { - unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; + unsigned char length[MAX_SYMBOLS]; + unsigned short temp[MAX_HUFCODE_BITS+1]; int minLen, maxLen, pp; /* Read Huffman code lengths for each symbol. They're stored in a way similar to mtf; record a starting diff -Nru xen-4.17.3+10-g091466ba55/xen/common/core_parking.c xen-4.17.5/xen/common/core_parking.c --- xen-4.17.3+10-g091466ba55/xen/common/core_parking.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/core_parking.c 2024-08-14 09:03:57.000000000 +0000 @@ -30,10 +30,11 @@ static uint32_t cur_idle_nums; static unsigned int core_parking_cpunum[NR_CPUS] = {[0 ... 
NR_CPUS-1] = -1}; -static const struct cp_policy { +struct cp_policy { char name[30]; unsigned int (*next)(unsigned int event); -} *__read_mostly core_parking_policy; +}; +static struct cp_policy __ro_after_init core_parking_policy; static enum core_parking_controller { POWER_FIRST, @@ -175,12 +176,13 @@ unsigned int cpu; int ret = 0; - if ( !core_parking_policy ) + if ( !core_parking_policy.next ) return -EINVAL; while ( cur_idle_nums < idle_nums ) { - cpu = core_parking_policy->next(CORE_PARKING_INCREMENT); + cpu = alternative_call(core_parking_policy.next, + CORE_PARKING_INCREMENT); ret = cpu_down(cpu); if ( ret ) return ret; @@ -193,7 +195,8 @@ while ( cur_idle_nums > idle_nums ) { - cpu = core_parking_policy->next(CORE_PARKING_DECREMENT); + cpu = alternative_call(core_parking_policy.next, + CORE_PARKING_DECREMENT); ret = cpu_up(cpu); if ( ret ) return ret; @@ -239,12 +242,12 @@ return cur_idle_nums; } -static const struct cp_policy power_first = { +static const struct cp_policy __initconst_cf_clobber power_first = { .name = "power", .next = core_parking_power, }; -static const struct cp_policy performance_first = { +static const struct cp_policy __initconst_cf_clobber performance_first = { .name = "performance", .next = core_parking_performance, }; @@ -254,7 +257,7 @@ if ( !policy || !policy->next ) return -EINVAL; - core_parking_policy = policy; + core_parking_policy = *policy; return 0; } @@ -269,4 +272,4 @@ return ret; } -__initcall(core_parking_init); +presmp_initcall(core_parking_init); diff -Nru xen-4.17.3+10-g091466ba55/xen/common/cpu.c xen-4.17.5/xen/common/cpu.c --- xen-4.17.3+10-g091466ba55/xen/common/cpu.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/cpu.c 2024-08-14 09:03:57.000000000 +0000 @@ -68,6 +68,11 @@ write_unlock(&cpu_add_remove_lock); } +bool cpu_in_hotplug_context(void) +{ + return rw_is_write_locked_by_me(&cpu_add_remove_lock); +} + static NOTIFIER_HEAD(cpu_chain); void __init register_cpu_notifier(struct notifier_block *nb) diff -Nru xen-4.17.3+10-g091466ba55/xen/common/domain.c xen-4.17.5/xen/common/domain.c --- xen-4.17.3+10-g091466ba55/xen/common/domain.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/domain.c 2024-08-14 09:03:57.000000000 +0000 @@ -351,7 +351,8 @@ } static unsigned int __read_mostly extra_hwdom_irqs; -static unsigned int __read_mostly extra_domU_irqs = 32; +#define DEFAULT_EXTRA_DOMU_IRQS 32U +static unsigned int __read_mostly extra_domU_irqs = DEFAULT_EXTRA_DOMU_IRQS; static int __init cf_check parse_extra_guest_irqs(const char *s) { @@ -659,7 +660,7 @@ d->nr_pirqs = nr_static_irqs + extra_domU_irqs; else d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs - : arch_hwdom_irqs(domid); + : arch_hwdom_irqs(d); d->nr_pirqs = min(d->nr_pirqs, nr_irqs); radix_tree_init(&d->pirq_tree); @@ -783,6 +784,25 @@ if ( IS_ERR(dom_xen) ) panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen)); +#ifdef CONFIG_HAS_PIRQ + /* Bound-check values passed via "extra_guest_irqs=". */ + { + unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs); + + if ( extra_hwdom_irqs > n - nr_static_irqs ) + { + extra_hwdom_irqs = n - nr_static_irqs; + printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n); + } + if ( extra_domU_irqs > + max(DEFAULT_EXTRA_DOMU_IRQS, n - nr_static_irqs) ) + { + extra_domU_irqs = n - nr_static_irqs; + printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n); + } + } +#endif + /* * Initialise our DOMID_IO domain. 
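The bounding block added above clamps operator-supplied "extra_guest_irqs=" values against what the architecture could ever hand out. A worked example with made-up numbers (the arch_hwdom_irqs() result and nr_static_irqs value are illustrative only):

    #include <stdio.h>

    #define max(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
        unsigned int nr_static_irqs = 24;      /* illustrative */
        unsigned int arch_limit = 544;         /* arch_hwdom_irqs() stand-in */
        unsigned int extra_hwdom_irqs = 10000; /* bogus command line value */

        unsigned int n = max(arch_limit, nr_static_irqs);

        if ( extra_hwdom_irqs > n - nr_static_irqs )
            extra_hwdom_irqs = n - nr_static_irqs;  /* clamped to 520 here */

        printf("hwdom IRQs bounded to %u\n", n);
        return 0;
    }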
* This domain owns I/O pages that are within the range of the page_info diff -Nru xen-4.17.3+10-g091466ba55/xen/common/domctl.c xen-4.17.5/xen/common/domctl.c --- xen-4.17.3+10-g091466ba55/xen/common/domctl.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/domctl.c 2024-08-14 09:03:57.000000000 +0000 @@ -318,7 +318,9 @@ return -ESRCH; } - ret = xsm_domctl(XSM_OTHER, d, op->cmd); + ret = xsm_domctl(XSM_OTHER, d, op->cmd, + /* SSIDRef only applicable for cmd == createdomain */ + op->u.createdomain.ssidref); if ( ret ) goto domctl_out_unlock_domonly; diff -Nru xen-4.17.3+10-g091466ba55/xen/common/event_channel.c xen-4.17.5/xen/common/event_channel.c --- xen-4.17.3+10-g091466ba55/xen/common/event_channel.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/event_channel.c 2024-08-14 09:03:57.000000000 +0000 @@ -62,7 +62,7 @@ * just assume the event channel is free or unbound at the moment when the * evtchn_read_trylock() returns false. */ -static inline void evtchn_write_lock(struct evtchn *evtchn) +static always_inline void evtchn_write_lock(struct evtchn *evtchn) { write_lock(&evtchn->lock); @@ -364,7 +364,8 @@ return rc; } -static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) +static always_inline void double_evtchn_lock(struct evtchn *lchn, + struct evtchn *rchn) { ASSERT(lchn != rchn); @@ -617,7 +618,9 @@ if ( rc != 0 ) { info->evtchn = 0; +#ifdef CONFIG_X86 pirq_cleanup_check(info, d); +#endif goto out; } @@ -679,10 +682,15 @@ if ( !is_hvm_domain(d1) ) pirq_guest_unbind(d1, pirq); pirq->evtchn = 0; - pirq_cleanup_check(pirq, d1); #ifdef CONFIG_X86 - if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 ) - unmap_domain_pirq_emuirq(d1, pirq->pirq); + if ( !is_hvm_domain(d1) || + domain_pirq_to_irq(d1, pirq->pirq) <= 0 || + unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 ) + /* + * The successful path of unmap_domain_pirq_emuirq() will have + * called pirq_cleanup_check() already. 
+ */ + pirq_cleanup_check(pirq, d1); #endif } unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]); diff -Nru xen-4.17.3+10-g091466ba55/xen/common/grant_table.c xen-4.17.5/xen/common/grant_table.c --- xen-4.17.3+10-g091466ba55/xen/common/grant_table.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/grant_table.c 2024-08-14 09:03:57.000000000 +0000 @@ -410,7 +410,7 @@ static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock); -static inline void grant_read_lock(struct grant_table *gt) +static always_inline void grant_read_lock(struct grant_table *gt) { percpu_read_lock(grant_rwlock, >->lock); } @@ -420,7 +420,7 @@ percpu_read_unlock(grant_rwlock, >->lock); } -static inline void grant_write_lock(struct grant_table *gt) +static always_inline void grant_write_lock(struct grant_table *gt) { percpu_write_lock(grant_rwlock, >->lock); } @@ -457,7 +457,7 @@ return num_act_frames_from_sha_frames(nr_grant_frames(gt)); } -static inline struct active_grant_entry * +static always_inline struct active_grant_entry * active_entry_acquire(struct grant_table *t, grant_ref_t e) { struct active_grant_entry *act; diff -Nru xen-4.17.3+10-g091466ba55/xen/common/irq.c xen-4.17.5/xen/common/irq.c --- xen-4.17.3+10-g091466ba55/xen/common/irq.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/irq.c 2024-08-14 09:03:57.000000000 +0000 @@ -1,6 +1,8 @@ #include #include +DEFINE_PER_CPU(struct cpu_user_regs *, irq_regs); + int init_one_irq_desc(struct irq_desc *desc) { int err; diff -Nru xen-4.17.3+10-g091466ba55/xen/common/livepatch.c xen-4.17.5/xen/common/livepatch.c --- xen-4.17.3+10-g091466ba55/xen/common/livepatch.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/livepatch.c 2024-08-14 09:03:57.000000000 +0000 @@ -36,13 +36,14 @@ * caller in schedule_work. */ static DEFINE_SPINLOCK(payload_lock); -static LIST_HEAD(payload_list); - /* - * Patches which have been applied. Need RCU in case we crash (and then - * traps code would iterate via applied_list) when adding entries on the list. + * Need RCU in case we crash (and then traps code would iterate via + * payload_list) when adding entries on the list. */ -static DEFINE_RCU_READ_LOCK(rcu_applied_lock); +static DEFINE_RCU_READ_LOCK(rcu_payload_lock); +static LIST_HEAD(payload_list); + +/* Patches which have been applied. Only modified from stop machine context. */ static LIST_HEAD(applied_list); static unsigned int payload_cnt; @@ -111,12 +112,8 @@ const struct payload *data; bool_t r = 0; - /* - * Only RCU locking since this list is only ever changed during apply - * or revert context. And in case it dies there we need an safe list. - */ - rcu_read_lock(&rcu_applied_lock); - list_for_each_entry_rcu ( data, &applied_list, applied_list ) + rcu_read_lock(&rcu_payload_lock); + list_for_each_entry_rcu ( data, &payload_list, list ) { if ( (ptr >= data->rw_addr && ptr < (data->rw_addr + data->rw_size)) || @@ -130,7 +127,7 @@ } } - rcu_read_unlock(&rcu_applied_lock); + rcu_read_unlock(&rcu_payload_lock); return r; } @@ -166,12 +163,8 @@ const void *va = (const void *)addr; const char *n = NULL; - /* - * Only RCU locking since this list is only ever changed during apply - * or revert context. And in case it dies there we need an safe list. 
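With regions now registered at load time, is_patch() walks payload_list under the renamed RCU lock and applies a half-open range test to each payload's rw and text blocks. The test itself, in isolation (types and names illustrative):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Half-open interval test, as applied to each payload's rw/text ranges. */
    static bool in_region(const char *ptr, const char *base, size_t size)
    {
        return ptr >= base && ptr < base + size;
    }

    int main(void)
    {
        char blob[16];

        printf("%d\n", in_region(blob, blob, sizeof(blob)));      /* 1 */
        printf("%d\n", in_region(blob + 16, blob, sizeof(blob))); /* 0: end excluded */
        return 0;
    }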
- */ - rcu_read_lock(&rcu_applied_lock); - list_for_each_entry_rcu ( data, &applied_list, applied_list ) + rcu_read_lock(&rcu_payload_lock); + list_for_each_entry_rcu ( data, &payload_list, list ) { if ( va < data->text_addr || va >= (data->text_addr + data->text_size) ) @@ -200,7 +193,7 @@ n = data->symtab[best].name; break; } - rcu_read_unlock(&rcu_applied_lock); + rcu_read_unlock(&rcu_payload_lock); return n; } @@ -792,8 +785,14 @@ region = &payload->region; region->symbols_lookup = livepatch_symbols_lookup; - region->start = payload->text_addr; - region->end = payload->text_addr + payload->text_size; + region->text_start = payload->text_addr; + region->text_end = payload->text_addr + payload->text_size; + + if ( payload->ro_size ) + { + region->rodata_start = payload->ro_addr; + region->rodata_end = payload->ro_addr + payload->ro_size; + } /* Optional sections. */ for ( i = 0; i < BUGFRAME_NR; i++ ) @@ -817,28 +816,84 @@ if ( sec ) { #ifdef CONFIG_HAS_ALTERNATIVE + /* + * (As of April 2023), Alternatives are formed of: + * - An .altinstructions section with an array of struct alt_instr's. + * - An .altinstr_replacement section containing instructions. + * + * An individual alt_instr contains: + * - An orig reference, pointing into .text with a nonzero length + * - A repl reference, pointing into .altinstr_replacement + * + * It is legal to have zero-length replacements, meaning it is legal + * for the .altinstr_replacement section to be empty too. An + * implementation detail means that a zero-length replacement's repl + * reference will still be in the .altinstr_replacement section. + */ + const struct livepatch_elf_sec *repl_sec; struct alt_instr *a, *start, *end; if ( !section_ok(elf, sec, sizeof(*a)) ) return -EINVAL; + /* Tolerate an empty .altinstructions section... */ + if ( sec->sec->sh_size == 0 ) + goto alt_done; + + /* ... but otherwise, there needs to be something to alter... */ + if ( payload->text_size == 0 ) + { + printk(XENLOG_ERR LIVEPATCH "%s Alternatives provided, but no .text\n", + elf->name); + return -EINVAL; + } + + /* ... and something to be altered to. */ + repl_sec = livepatch_elf_sec_by_name(elf, ".altinstr_replacement"); + if ( !repl_sec ) + { + printk(XENLOG_ERR LIVEPATCH "%s .altinstructions provided, but no .altinstr_replacement\n", + elf->name); + return -EINVAL; + } + start = sec->load_addr; end = sec->load_addr + sec->sec->sh_size; for ( a = start; a < end; a++ ) { - const void *instr = ALT_ORIG_PTR(a); - const void *replacement = ALT_REPL_PTR(a); + const void *orig = ALT_ORIG_PTR(a); + const void *repl = ALT_REPL_PTR(a); - if ( (instr < region->start && instr >= region->end) || - (replacement < region->start && replacement >= region->end) ) + /* orig must be fully within .text. */ + if ( orig < payload->text_addr || + a->orig_len > payload->text_size || + orig + a->orig_len > payload->text_addr + payload->text_size ) { - printk(XENLOG_ERR LIVEPATCH "%s Alt patching outside payload: %p\n", - elf->name, instr); + printk(XENLOG_ERR LIVEPATCH + "%s Alternative orig %p+%#x outside payload text %p+%#zx\n", + elf->name, orig, a->orig_len, + payload->text_addr, payload->text_size); + return -EINVAL; + } + + /* + * repl must be fully within .altinstr_replacement, even if the + * replacement and the section happen to both have zero length. 
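The repl check that follows has the same three-clause shape as the orig check above. In isolation the idiom looks like the sketch below; comparing len against size before comparing end pointers helps keep an oversized len from producing a wrapped, and hence in-range looking, end address (a sketch of the shape, not the exact Xen expression):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Reject unless [ptr, ptr + len) lies inside [base, base + size). */
    static bool contained(const char *ptr, size_t len,
                          const char *base, size_t size)
    {
        return !(ptr < base || len > size || ptr + len > base + size);
    }

    int main(void)
    {
        char text[64];

        printf("%d %d\n",
               contained(text, 64, text, sizeof(text)),      /* 1 */
               contained(text + 60, 8, text, sizeof(text))); /* 0: spills out */
        return 0;
    }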
+ */ + if ( repl < repl_sec->load_addr || + a->repl_len > repl_sec->sec->sh_size || + repl + a->repl_len > repl_sec->load_addr + repl_sec->sec->sh_size ) + { + printk(XENLOG_ERR LIVEPATCH + "%s Alternative repl %p+%#x outside .altinstr_replacement %p+%#"PRIxElfWord"\n", + elf->name, repl, a->repl_len, + repl_sec->load_addr, repl_sec->sec->sh_size); return -EINVAL; } } apply_alternatives(start, end); + alt_done:; #else printk(XENLOG_ERR LIVEPATCH "%s: We don't support alternative patching\n", elf->name); @@ -1015,7 +1070,9 @@ static void free_payload(struct payload *data) { ASSERT(spin_is_locked(&payload_lock)); - list_del(&data->list); + unregister_virtual_region(&data->region); + list_del_rcu(&data->list); + rcu_barrier(); payload_cnt--; payload_version++; free_payload_data(data); @@ -1114,7 +1171,8 @@ INIT_LIST_HEAD(&data->list); INIT_LIST_HEAD(&data->applied_list); - list_add_tail(&data->list, &payload_list); + register_virtual_region(&data->region); + list_add_tail_rcu(&data->list, &payload_list); payload_cnt++; payload_version++; } @@ -1314,7 +1372,22 @@ ASSERT(!local_irq_is_enabled()); for ( i = 0; i < data->nfuncs; i++ ) - common_livepatch_apply(&data->funcs[i], &data->fstate[i]); + { + const struct livepatch_func *func = &data->funcs[i]; + struct livepatch_fstate *state = &data->fstate[i]; + + /* If the action has been already executed on this function, do nothing. */ + if ( state->applied == LIVEPATCH_FUNC_APPLIED ) + { + printk(XENLOG_WARNING LIVEPATCH + "%s: %s has been already applied before\n", + __func__, func->name); + continue; + } + + arch_livepatch_apply(func, state); + state->applied = LIVEPATCH_FUNC_APPLIED; + } arch_livepatch_revive(); @@ -1325,17 +1398,12 @@ static inline void apply_payload_tail(struct payload *data) { - /* - * We need RCU variant (which has barriers) in case we crash here. - * The applied_list is iterated by the trap code. - */ - list_add_tail_rcu(&data->applied_list, &applied_list); - register_virtual_region(&data->region); + list_add_tail(&data->applied_list, &applied_list); data->state = LIVEPATCH_STATE_APPLIED; } -static int revert_payload(struct payload *data) +int revert_payload(struct payload *data) { unsigned int i; int rc; @@ -1350,7 +1418,25 @@ } for ( i = 0; i < data->nfuncs; i++ ) - common_livepatch_revert(&data->funcs[i], &data->fstate[i]); + { + const struct livepatch_func *func = &data->funcs[i]; + struct livepatch_fstate *state = &data->fstate[i]; + + /* + * If the apply action hasn't been executed on this function, do + * nothing. + */ + if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) + { + printk(XENLOG_WARNING LIVEPATCH + "%s: %s has not been applied before\n", + __func__, func->name); + continue; + } + + arch_livepatch_revert(func, state); + state->applied = LIVEPATCH_FUNC_NOT_APPLIED; + } /* * Since we are running with IRQs disabled and the hooks may call common @@ -1368,15 +1454,9 @@ return 0; } -static inline void revert_payload_tail(struct payload *data) +void revert_payload_tail(struct payload *data) { - - /* - * We need RCU variant (which has barriers) in case we crash here. - * The applied_list is iterated by the trap code. 
- */ - list_del_rcu(&data->applied_list); - unregister_virtual_region(&data->region); + list_del(&data->applied_list); data->reverted = true; data->state = LIVEPATCH_STATE_CHECKED; diff -Nru xen-4.17.3+10-g091466ba55/xen/common/rwlock.c xen-4.17.5/xen/common/rwlock.c --- xen-4.17.3+10-g091466ba55/xen/common/rwlock.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/rwlock.c 2024-08-14 09:03:57.000000000 +0000 @@ -34,8 +34,11 @@ /* * Put the reader into the wait queue. + * + * Use the speculation unsafe helper, as it's the caller responsibility to + * issue a speculation barrier if required. */ - spin_lock(&lock->lock); + _spin_lock(&lock->lock); /* * At the head of the wait queue now, wait until the writer state @@ -64,8 +67,13 @@ { u32 cnts; - /* Put the writer into the wait queue. */ - spin_lock(&lock->lock); + /* + * Put the writer into the wait queue. + * + * Use the speculation unsafe helper, as it's the caller responsibility to + * issue a speculation barrier if required. + */ + _spin_lock(&lock->lock); /* Try to acquire the lock directly if no reader is present. */ if ( !atomic_read(&lock->cnts) && @@ -117,8 +125,12 @@ /* * First take the write lock to protect against other writers or slow * path readers. + * + * Note we use the speculation unsafe variant of write_lock(), as the + * calling wrapper already adds a speculation barrier after the lock has + * been taken. */ - write_lock(&percpu_rwlock->rwlock); + _write_lock(&percpu_rwlock->rwlock); /* Now set the global variable so that readers start using read_lock. */ percpu_rwlock->writer_activating = 1; diff -Nru xen-4.17.3+10-g091466ba55/xen/common/sched/compat.c xen-4.17.5/xen/common/sched/compat.c --- xen-4.17.3+10-g091466ba55/xen/common/sched/compat.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/sched/compat.c 2024-08-14 09:03:57.000000000 +0000 @@ -39,9 +39,9 @@ #include "core.c" -int compat_set_timer_op(u32 lo, s32 hi) +int compat_set_timer_op(uint32_t lo, uint32_t hi) { - return do_set_timer_op(((s64)hi << 32) | lo); + return do_set_timer_op(((uint64_t)hi << 32) | lo); } /* diff -Nru xen-4.17.3+10-g091466ba55/xen/common/sched/core.c xen-4.17.5/xen/common/sched/core.c --- xen-4.17.3+10-g091466ba55/xen/common/sched/core.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/sched/core.c 2024-08-14 09:03:57.000000000 +0000 @@ -348,23 +348,28 @@ * This avoids dead- or live-locks when this code is running on both * cpus at the same time. */ -static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, - unsigned long *flags) +static always_inline void sched_spin_lock_double( + spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags) { + /* + * In order to avoid extra overhead, use the locking primitives without the + * speculation barrier, and introduce a single barrier here. 
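sched_spin_lock_double() relies on the classic address-ordered locking idiom, which the speculation rework keeps intact: whichever argument order two CPUs use, both acquire the pair in the same global order and so cannot deadlock against each other. A pthread rendering of just the ordering part:

    #include <pthread.h>

    /* Take two locks in address order, regardless of argument order. */
    static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);
        else if ( a < b )
        {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    int main(void)
    {
        pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t y = PTHREAD_MUTEX_INITIALIZER;

        lock_double(&x, &y);      /* same order as lock_double(&y, &x) */
        pthread_mutex_unlock(&y);
        pthread_mutex_unlock(&x);
        return 0;
    }

Ordering unrelated pointers with < is formally unspecified in ISO C, but the idiom depends only on the flat address ordering every supported toolchain provides.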
+ */ if ( lock1 == lock2 ) { - spin_lock_irqsave(lock1, *flags); + *flags = _spin_lock_irqsave(lock1); } else if ( lock1 < lock2 ) { - spin_lock_irqsave(lock1, *flags); - spin_lock(lock2); + *flags = _spin_lock_irqsave(lock1); + _spin_lock(lock2); } else { - spin_lock_irqsave(lock2, *flags); - spin_lock(lock1); + *flags = _spin_lock_irqsave(lock2); + _spin_lock(lock1); } + block_lock_speculation(); } static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, @@ -2755,6 +2760,36 @@ return sr; } +static void cf_check sched_res_free(struct rcu_head *head) +{ + struct sched_resource *sr = container_of(head, struct sched_resource, rcu); + + free_cpumask_var(sr->cpus); + if ( sr->sched_unit_idle ) + sched_free_unit_mem(sr->sched_unit_idle); + xfree(sr); +} + +static void cpu_schedule_down(unsigned int cpu) +{ + struct sched_resource *sr; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + + kill_timer(&sr->s_timer); + + cpumask_clear_cpu(cpu, &sched_res_mask); + set_sched_res(cpu, NULL); + + /* Keep idle unit. */ + sr->sched_unit_idle = NULL; + call_rcu(&sr->rcu, sched_res_free); + + rcu_read_unlock(&sched_res_rculock); +} + static int cpu_schedule_up(unsigned int cpu) { struct sched_resource *sr; @@ -2794,7 +2829,10 @@ idle_vcpu[cpu]->sched_unit->res = sr; if ( idle_vcpu[cpu] == NULL ) + { + cpu_schedule_down(cpu); return -ENOMEM; + } idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; @@ -2812,36 +2850,6 @@ return 0; } -static void cf_check sched_res_free(struct rcu_head *head) -{ - struct sched_resource *sr = container_of(head, struct sched_resource, rcu); - - free_cpumask_var(sr->cpus); - if ( sr->sched_unit_idle ) - sched_free_unit_mem(sr->sched_unit_idle); - xfree(sr); -} - -static void cpu_schedule_down(unsigned int cpu) -{ - struct sched_resource *sr; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - - kill_timer(&sr->s_timer); - - cpumask_clear_cpu(cpu, &sched_res_mask); - set_sched_res(cpu, NULL); - - /* Keep idle unit. */ - sr->sched_unit_idle = NULL; - call_rcu(&sr->rcu, sched_res_free); - - rcu_read_unlock(&sched_res_rculock); -} - void sched_rm_cpu(unsigned int cpu) { int rc; @@ -3174,6 +3182,8 @@ sr->scheduler = new_ops; sr->sched_priv = ppriv; + sr->granularity = cpupool_get_granularity(c); + sr->cpupool = c; /* * Reroute the lock to the per pCPU lock as /last/ thing. In fact, @@ -3186,8 +3196,6 @@ /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */ spin_unlock_irqrestore(old_lock, flags); - sr->granularity = cpupool_get_granularity(c); - sr->cpupool = c; /* The cpu is added to a pool, trigger it to go pick up some work */ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); diff -Nru xen-4.17.3+10-g091466ba55/xen/common/sched/private.h xen-4.17.5/xen/common/sched/private.h --- xen-4.17.3+10-g091466ba55/xen/common/sched/private.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/sched/private.h 2024-08-14 09:03:57.000000000 +0000 @@ -207,8 +207,24 @@ #define cpumask_scratch (&this_cpu(cpumask_scratch)) #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c)) +/* + * Deal with _spin_lock_irqsave() returning the flags value instead of storing + * it in a passed parameter. 
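_sched_spinlock() picks its helper by counting the trailing macro arguments, so one wrapper covers both the flags-returning irqsave variant and the plain one. A self-contained GNU C illustration of that dispatch trick (the COUNT_ARGS() here is a local stand-in, not Xen's count_args()):

    #include <stdio.h>

    /* GNU C: count 0..2 trailing arguments, then paste the helper name. */
    #define COUNT_ARGS_(x0, x1, x2, n, ...) n
    #define COUNT_ARGS(...) COUNT_ARGS_(, ##__VA_ARGS__, 2, 1, 0)

    #define LOCK__(nr) lock##nr
    #define LOCK_(nr)  LOCK__(nr)

    #define lock0(l)       printf("lock(%s)\n", l)
    #define lock1(l, flag) printf("lock(%s), flags into %s\n", l, flag)

    #define lock(l, args...) LOCK_(COUNT_ARGS(args))(l, ## args)

    int main(void)
    {
        lock("rq");           /* expands to lock0 */
        lock("rq", "flags");  /* expands to lock1 */
        return 0;
    }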
+ */ +#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock) +#define _sched_spinlock1(lock, irq, arg) ({ \ + BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \ + (arg) = _spin_lock##irq(lock); \ +}) + +#define _sched_spinlock__(nr) _sched_spinlock ## nr +#define _sched_spinlock_(nr) _sched_spinlock__(nr) +#define _sched_spinlock(lock, irq, args...) \ + _sched_spinlock_(count_args(args))(lock, irq, ## args) + #define sched_lock(kind, param, cpu, irq, arg...) \ -static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ +static always_inline spinlock_t \ +*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ { \ for ( ; ; ) \ { \ @@ -220,10 +236,16 @@ * \ * It may also be the case that v->processor may change but the \ * lock may be the same; this will succeed in that case. \ + * \ + * Use the speculation unsafe locking helper, there's a speculation \ + * barrier before returning to the caller. \ */ \ - spin_lock##irq(lock, ## arg); \ + _sched_spinlock(lock, irq, ## arg); \ if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \ + { \ + block_lock_speculation(); \ return lock; \ + } \ spin_unlock##irq(lock, ## arg); \ } \ } diff -Nru xen-4.17.3+10-g091466ba55/xen/common/timer.c xen-4.17.5/xen/common/timer.c --- xen-4.17.3+10-g091466ba55/xen/common/timer.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/timer.c 2024-08-14 09:03:57.000000000 +0000 @@ -240,7 +240,7 @@ list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); } -static inline bool_t timer_lock(struct timer *timer) +static inline bool_t timer_lock_unsafe(struct timer *timer) { unsigned int cpu; @@ -254,7 +254,8 @@ rcu_read_unlock(&timer_cpu_read_lock); return 0; } - spin_lock(&per_cpu(timers, cpu).lock); + /* Use the speculation unsafe variant, the wrapper has the barrier. */ + _spin_lock(&per_cpu(timers, cpu).lock); if ( likely(timer->cpu == cpu) ) break; spin_unlock(&per_cpu(timers, cpu).lock); @@ -267,8 +268,9 @@ #define timer_lock_irqsave(t, flags) ({ \ bool_t __x; \ local_irq_save(flags); \ - if ( !(__x = timer_lock(t)) ) \ + if ( !(__x = timer_lock_unsafe(t)) ) \ local_irq_restore(flags); \ + block_lock_speculation(); \ __x; \ }) diff -Nru xen-4.17.3+10-g091466ba55/xen/common/ubsan/ubsan.h xen-4.17.5/xen/common/ubsan/ubsan.h --- xen-4.17.3+10-g091466ba55/xen/common/ubsan/ubsan.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/ubsan/ubsan.h 2024-08-14 09:03:57.000000000 +0000 @@ -10,7 +10,7 @@ struct type_descriptor { u16 type_kind; u16 type_info; - char type_name[1]; + char type_name[]; }; struct source_location { diff -Nru xen-4.17.3+10-g091466ba55/xen/common/virtual_region.c xen-4.17.5/xen/common/virtual_region.c --- xen-4.17.3+10-g091466ba55/xen/common/virtual_region.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/common/virtual_region.c 2024-08-14 09:03:57.000000000 +0000 @@ -11,26 +11,22 @@ static struct virtual_region core = { .list = LIST_HEAD_INIT(core.list), - .start = _stext, - .end = _etext, + .text_start = _stext, + .text_end = _etext, + .rodata_start = _srodata, + .rodata_end = _erodata, }; /* Becomes irrelevant when __init sections are cleared. */ static struct virtual_region core_init __initdata = { .list = LIST_HEAD_INIT(core_init.list), - .start = _sinittext, - .end = _einittext, + .text_start = _sinittext, + .text_end = _einittext, }; /* - * RCU locking. Additions are done either at startup (when there is only - * one CPU) or when all CPUs are running without IRQs. - * - * Deletions are bit tricky. 
We do it when Live Patch (all CPUs running - * without IRQs) or during bootup (when clearing the init). - * - * Hence we use list_del_rcu (which sports an memory fence) and a spinlock - * on deletion. + * RCU locking. Modifications to the list must be done in exclusive mode, and + * hence need to hold the spinlock. * * All readers of virtual_region_list MUST use list_for_each_entry_rcu. */ @@ -45,7 +41,8 @@ rcu_read_lock(&rcu_virtual_region_lock); list_for_each_entry_rcu( region, &virtual_region_list, list ) { - if ( (void *)addr >= region->start && (void *)addr < region->end ) + if ( (void *)addr >= region->text_start && + (void *)addr < region->text_end ) { rcu_read_unlock(&rcu_virtual_region_lock); return region; @@ -58,50 +55,51 @@ void register_virtual_region(struct virtual_region *r) { - ASSERT(!local_irq_is_enabled()); + unsigned long flags; + spin_lock_irqsave(&virtual_region_lock, flags); list_add_tail_rcu(&r->list, &virtual_region_list); + spin_unlock_irqrestore(&virtual_region_lock, flags); } -static void remove_virtual_region(struct virtual_region *r) +/* + * Suggest inline so when !CONFIG_LIVEPATCH the function is not left + * unreachable after init code is removed. + */ +static void inline remove_virtual_region(struct virtual_region *r) { unsigned long flags; spin_lock_irqsave(&virtual_region_lock, flags); list_del_rcu(&r->list); spin_unlock_irqrestore(&virtual_region_lock, flags); - /* - * We do not need to invoke call_rcu. - * - * This is due to the fact that on the deletion we have made sure - * to use spinlocks (to guard against somebody else calling - * unregister_virtual_region) and list_deletion spiced with - * memory barrier. - * - * That protects us from corrupting the list as the readers all - * use list_for_each_entry_rcu which is safe against concurrent - * deletions. - */ } +#ifdef CONFIG_LIVEPATCH void unregister_virtual_region(struct virtual_region *r) { - /* Expected to be called from Live Patch - which has IRQs disabled. */ - ASSERT(!local_irq_is_enabled()); - remove_virtual_region(r); + + /* Assert that no CPU might be using the removed region. 
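The barrier being documented here is what makes unregistering safe: the region is unlinked with list_del_rcu(), then the caller waits out a grace period so no reader can still be dereferencing the node before the payload memory is reused. A toy counter-based model of that contract (real RCU tracks quiescent states rather than a shared counter):

    #include <stdatomic.h>

    /* Toy grace period: readers mark their critical sections; the deleter,
     * having already unlinked the node, waits for the count to drain
     * before reusing the memory. */
    static _Atomic int readers;

    static void reader_enter(void) { atomic_fetch_add(&readers, 1); }
    static void reader_exit(void)  { atomic_fetch_sub(&readers, 1); }

    static void wait_for_readers(void)
    {
        while ( atomic_load(&readers) )
            ;   /* late readers can no longer find the unlinked node */
    }

    int main(void)
    {
        reader_enter();
        reader_exit();
        wait_for_readers();   /* safe to free/reuse after this returns */
        return 0;
    }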
*/ + rcu_barrier(); } -#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_X86) +#ifdef CONFIG_X86 void relax_virtual_region_perms(void) { const struct virtual_region *region; rcu_read_lock(&rcu_virtual_region_lock); list_for_each_entry_rcu( region, &virtual_region_list, list ) - modify_xen_mappings_lite((unsigned long)region->start, - ROUNDUP((unsigned long)region->end, PAGE_SIZE), + { + modify_xen_mappings_lite((unsigned long)region->text_start, + PAGE_ALIGN((unsigned long)region->text_end), PAGE_HYPERVISOR_RWX); + if ( region->rodata_start ) + modify_xen_mappings_lite((unsigned long)region->rodata_start, + PAGE_ALIGN((unsigned long)region->rodata_end), + PAGE_HYPERVISOR_RW); + } rcu_read_unlock(&rcu_virtual_region_lock); } @@ -111,12 +109,19 @@ rcu_read_lock(&rcu_virtual_region_lock); list_for_each_entry_rcu( region, &virtual_region_list, list ) - modify_xen_mappings_lite((unsigned long)region->start, - ROUNDUP((unsigned long)region->end, PAGE_SIZE), + { + modify_xen_mappings_lite((unsigned long)region->text_start, + PAGE_ALIGN((unsigned long)region->text_end), PAGE_HYPERVISOR_RX); + if ( region->rodata_start ) + modify_xen_mappings_lite((unsigned long)region->rodata_start, + PAGE_ALIGN((unsigned long)region->rodata_end), + PAGE_HYPERVISOR_RO); + } rcu_read_unlock(&rcu_virtual_region_lock); } -#endif +#endif /* CONFIG_X86 */ +#endif /* CONFIG_LIVEPATCH */ void __init unregister_init_virtual_region(void) { diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/acpi/pmstat.c xen-4.17.5/xen/drivers/acpi/pmstat.c --- xen-4.17.3+10-g091466ba55/xen/drivers/acpi/pmstat.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/acpi/pmstat.c 2024-08-14 09:03:57.000000000 +0000 @@ -255,7 +255,8 @@ return ret; op->u.get_para.cpuinfo_cur_freq = - cpufreq_driver.get ? cpufreq_driver.get(op->cpuid) : policy->cur; + cpufreq_driver.get ? 
alternative_call(cpufreq_driver.get, op->cpuid) + : policy->cur; op->u.get_para.cpuinfo_max_freq = policy->cpuinfo.max_freq; op->u.get_para.cpuinfo_min_freq = policy->cpuinfo.min_freq; op->u.get_para.scaling_cur_freq = policy->cur; diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/cpufreq/cpufreq.c xen-4.17.5/xen/drivers/cpufreq/cpufreq.c --- xen-4.17.3+10-g091466ba55/xen/drivers/cpufreq/cpufreq.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/cpufreq/cpufreq.c 2024-08-14 09:03:57.000000000 +0000 @@ -240,7 +240,7 @@ policy->cpu = cpu; per_cpu(cpufreq_cpu_policy, cpu) = policy; - ret = cpufreq_driver.init(policy); + ret = alternative_call(cpufreq_driver.init, policy); if (ret) { free_cpumask_var(policy->cpus); xfree(policy); @@ -299,7 +299,7 @@ cpumask_clear_cpu(cpu, cpufreq_dom->map); if (cpumask_empty(policy->cpus)) { - cpufreq_driver.exit(policy); + alternative_call(cpufreq_driver.exit, policy); free_cpumask_var(policy->cpus); xfree(policy); } @@ -363,7 +363,7 @@ cpumask_clear_cpu(cpu, cpufreq_dom->map); if (cpumask_empty(policy->cpus)) { - cpufreq_driver.exit(policy); + alternative_call(cpufreq_driver.exit, policy); free_cpumask_var(policy->cpus); xfree(policy); } diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/cpufreq/utility.c xen-4.17.5/xen/drivers/cpufreq/utility.c --- xen-4.17.3+10-g091466ba55/xen/drivers/cpufreq/utility.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/cpufreq/utility.c 2024-08-14 09:03:57.000000000 +0000 @@ -412,7 +412,7 @@ policy->turbo = new_state; if (cpufreq_driver.update) { - ret = cpufreq_driver.update(cpuid, policy); + ret = alternative_call(cpufreq_driver.update, cpuid, policy); if (ret) policy->turbo = curr_state; } @@ -448,7 +448,7 @@ return -EINVAL; /* verify the cpu speed can be set within this limit */ - ret = cpufreq_driver.verify(policy); + ret = alternative_call(cpufreq_driver.verify, policy); if (ret) return ret; @@ -456,7 +456,7 @@ data->max = policy->max; data->limits = policy->limits; if (cpufreq_driver.setpolicy) - return cpufreq_driver.setpolicy(data); + return alternative_call(cpufreq_driver.setpolicy, data); if (policy->governor != data->governor) { /* save old, working values */ diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/amd/iommu_acpi.c xen-4.17.5/xen/drivers/passthrough/amd/iommu_acpi.c --- xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/amd/iommu_acpi.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/passthrough/amd/iommu_acpi.c 2024-08-14 09:03:57.000000000 +0000 @@ -426,9 +426,14 @@ return -EIO; } - /* Types which won't be handed out are considered good enough. */ - if ( !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | - RAM_TYPE_UNUSABLE)) ) + /* + * Types which aren't RAM are considered good enough. + * Note that a page being partially RESERVED, ACPI or UNUSABLE will + * force Xen into assuming the whole page as having that type in + * practice. + */ + if ( type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | + RAM_TYPE_UNUSABLE) ) continue; AMD_IOMMU_ERROR("IVMD: page at %lx can't be converted\n", addr); diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/pci.c xen-4.17.5/xen/drivers/passthrough/pci.c --- xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/pci.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/passthrough/pci.c 2024-08-14 09:03:57.000000000 +0000 @@ -52,9 +52,10 @@ static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED; -void pcidevs_lock(void) +/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. 
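The _unsafe suffix convention introduced by this series splits raw lock acquisition from the speculation barrier, so wrapper layers pay for the barrier exactly once. The shape of that split, with stand-in bodies (the block_lock_speculation() below is a compiler fence only; the real one is an architectural speculation barrier):

    /* Raw acquisition and barrier kept separate; wrappers add the barrier. */
    static int lock_word;

    static inline void block_lock_speculation(void)
    {
        asm volatile ( "" ::: "memory" );   /* placeholder only */
    }

    static inline void pcidevs_lock_unsafe(void)
    {
        lock_word = 1;                      /* placeholder acquisition */
    }

    static inline void pcidevs_lock(void)
    {
        pcidevs_lock_unsafe();
        block_lock_speculation();
    }

    int main(void)
    {
        pcidevs_lock();
        return lock_word != 1;
    }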
*/ +void pcidevs_lock_unsafe(void) { - spin_lock_recursive(&_pcidevs_lock); + _spin_lock_recursive(&_pcidevs_lock); } void pcidevs_unlock(void) diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/vtd/x86/ats.c xen-4.17.5/xen/drivers/passthrough/vtd/x86/ats.c --- xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/vtd/x86/ats.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/passthrough/vtd/x86/ats.c 2024-08-14 09:03:57.000000000 +0000 @@ -44,7 +44,7 @@ int ats_device(const struct pci_dev *pdev, const struct acpi_drhd_unit *drhd) { struct acpi_drhd_unit *ats_drhd; - int pos; + unsigned int pos, expfl = 0; if ( !ats_enabled || !iommu_qinval ) return 0; @@ -53,7 +53,13 @@ !ecap_dev_iotlb(drhd->iommu->ecap) ) return 0; - if ( !acpi_find_matched_atsr_unit(pdev) ) + pos = pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP); + if ( pos ) + expfl = pci_conf_read16(pdev->sbdf, pos + PCI_EXP_FLAGS); + + if ( MASK_EXTR(expfl, PCI_EXP_FLAGS_TYPE) != PCI_EXP_TYPE_RC_END && + !acpi_find_matched_atsr_unit(pdev) ) return 0; ats_drhd = find_ats_dev_drhd(drhd->iommu); diff -Nru xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/x86/iommu.c xen-4.17.5/xen/drivers/passthrough/x86/iommu.c --- xen-4.17.3+10-g091466ba55/xen/drivers/passthrough/x86/iommu.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/drivers/passthrough/x86/iommu.c 2024-08-14 09:03:57.000000000 +0000 @@ -251,24 +251,36 @@ if ( p2ma == p2m_access_x ) return -ENOENT; - while ( base_pfn < end_pfn ) - { - int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); - - if ( err ) - return err; - base_pfn++; - } - map = xmalloc(struct identity_map); if ( !map ) return -ENOMEM; + map->base = base; map->end = end; map->access = p2ma; map->count = 1; + + /* + * Insert into list ahead of mapping, so the range can be found when + * trying to clean up. 
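The reordering above is an insert-before-act pattern: publish the tracking node first, so a failure partway through the per-page loop leaves enough state behind for the teardown path to find and undo the partial mapping; only a failure on the very first page unlinks and frees the node again. A compact sketch of the same error-handling shape (list and mapping primitives are stand-ins):

    #include <stdlib.h>
    #include <stdbool.h>

    struct map { struct map *next; unsigned long base, end; };
    static struct map *maps;

    static bool map_one(unsigned long pfn) { return pfn != 42; } /* fake */

    static int identity_map(unsigned long base, unsigned long end)
    {
        struct map *m = malloc(sizeof(*m));

        if ( !m )
            return -1;
        m->base = base;
        m->end = end;
        m->next = maps;
        maps = m;                        /* publish before acting */

        for ( unsigned long pfn = base; pfn < end; ++pfn )
            if ( !map_one(pfn) )
            {
                if ( pfn == base )       /* nothing mapped yet: unlink */
                {
                    maps = m->next;
                    free(m);
                }
                return -1;               /* else leave node for teardown */
            }
        return 0;
    }

    int main(void) { return identity_map(40, 45) == 0; }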
+ */ list_add_tail(&map->list, &hd->arch.identity_maps); + for ( ; base_pfn < end_pfn; ++base_pfn ) + { + int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); + + if ( !err ) + continue; + + if ( (map->base >> PAGE_SHIFT_4K) == base_pfn ) + { + list_del(&map->list); + xfree(map); + } + return err; + } + return 0; } diff -Nru xen-4.17.3+10-g091466ba55/xen/include/hypercall-defs.c xen-4.17.5/xen/include/hypercall-defs.c --- xen-4.17.3+10-g091466ba55/xen/include/hypercall-defs.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/hypercall-defs.c 2024-08-14 09:03:57.000000000 +0000 @@ -127,7 +127,7 @@ #ifdef CONFIG_COMPAT prefix: compat -set_timer_op(uint32_t lo, int32_t hi) +set_timer_op(uint32_t lo, uint32_t hi) multicall(multicall_entry_compat_t *call_list, uint32_t nr_calls) memory_op(unsigned int cmd, void *arg) #ifdef CONFIG_IOREQ_SERVER diff -Nru xen-4.17.3+10-g091466ba55/xen/include/public/arch-x86/cpufeatureset.h xen-4.17.5/xen/include/public/arch-x86/cpufeatureset.h --- xen-4.17.3+10-g091466ba55/xen/include/public/arch-x86/cpufeatureset.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/public/arch-x86/cpufeatureset.h 2024-08-14 09:03:57.000000000 +0000 @@ -123,7 +123,7 @@ XEN_CPUFEATURE(DCA, 1*32+18) /* Direct Cache Access */ XEN_CPUFEATURE(SSE4_1, 1*32+19) /*A Streaming SIMD Extensions 4.1 */ XEN_CPUFEATURE(SSE4_2, 1*32+20) /*A Streaming SIMD Extensions 4.2 */ -XEN_CPUFEATURE(X2APIC, 1*32+21) /*!A Extended xAPIC */ +XEN_CPUFEATURE(X2APIC, 1*32+21) /*!S Extended xAPIC */ XEN_CPUFEATURE(MOVBE, 1*32+22) /*A movbe instruction */ XEN_CPUFEATURE(POPCNT, 1*32+23) /*A POPCNT instruction */ XEN_CPUFEATURE(TSC_DEADLINE, 1*32+24) /*S TSC Deadline Timer */ @@ -260,10 +260,11 @@ XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a VP2INTERSECT{D,Q} insns */ XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ -XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ -XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ +XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ +XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! 
RTM disabled (but XBEGIN won't fault) */
 XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
 XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */
+XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */
 XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */
 XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */
 XEN_CPUFEATURE(AVX512_FP16, 9*32+23) /* AVX512 FP16 instructions */
@@ -294,9 +295,9 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:2.edx, word 13 */
 XEN_CPUFEATURE(INTEL_PSFD, 13*32+ 0) /*A MSR_SPEC_CTRL.PSFD */
-XEN_CPUFEATURE(IPRED_CTRL, 13*32+ 1) /* MSR_SPEC_CTRL.IPRED_DIS_* */
-XEN_CPUFEATURE(RRSBA_CTRL, 13*32+ 2) /* MSR_SPEC_CTRL.RRSBA_DIS_* */
-XEN_CPUFEATURE(BHI_CTRL, 13*32+ 4) /* MSR_SPEC_CTRL.BHI_DIS_S */
+XEN_CPUFEATURE(IPRED_CTRL, 13*32+ 1) /*S MSR_SPEC_CTRL.IPRED_DIS_* */
+XEN_CPUFEATURE(RRSBA_CTRL, 13*32+ 2) /*S MSR_SPEC_CTRL.RRSBA_DIS_* */
+XEN_CPUFEATURE(BHI_CTRL, 13*32+ 4) /*S MSR_SPEC_CTRL.BHI_DIS_S */
 XEN_CPUFEATURE(MCDT_NO, 13*32+ 5) /*A MCDT_NO */
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
@@ -321,7 +322,7 @@
 XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */
 XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */
 XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */
-XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */
+XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */
 XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */
 XEN_CPUFEATURE(RRSBA, 16*32+19) /*! Restricted RSB Alternative */
 XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */
@@ -330,6 +331,8 @@
 XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */
 XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */
 XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */
+XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */
+XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */
 /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */
diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/alternative-call.h xen-4.17.5/xen/include/xen/alternative-call.h
--- xen-4.17.3+10-g091466ba55/xen/include/xen/alternative-call.h 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/xen/include/xen/alternative-call.h 2024-08-14 09:03:57.000000000 +0000
@@ -50,7 +50,12 @@
 #include
-#define __alt_call_maybe_initdata __initdata
+#ifdef CONFIG_LIVEPATCH
+/* Must keep for livepatches to resolve alternative calls. */
+# define __alt_call_maybe_initdata __ro_after_init
+#else
+# define __alt_call_maybe_initdata __initdata
+#endif
 #else
diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/cpu.h xen-4.17.5/xen/include/xen/cpu.h
--- xen-4.17.3+10-g091466ba55/xen/include/xen/cpu.h 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/xen/include/xen/cpu.h 2024-08-14 09:03:57.000000000 +0000
@@ -13,6 +13,16 @@
 void cpu_hotplug_begin(void);
 void cpu_hotplug_done(void);
+/*
+ * Returns true when the caller CPU is within a cpu_hotplug_{begin,done}()
+ * region.
+ *
+ * This is required to safely identify hotplug contexts, as get_cpu_maps()
+ * would otherwise succeed because a caller holding the lock in write mode is
+ * allowed to acquire the same lock in read mode.
+ */
+bool cpu_in_hotplug_context(void);
+
 /* Receive notification of CPU hotplug events.
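 *
 * (Editorial sketch on cpu_in_hotplug_context() above, illustrative only
 * and not part of the patch: a caller that cannot rely on get_cpu_maps()
 * failing while a CPU operation is underway might guard itself as
 *
 *     if ( !cpu_in_hotplug_context() && !get_cpu_maps() )
 *         return -EBUSY;          // hypothetical error path
 *
 * taking the maps lock only when not already inside a hotplug region.)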
*/ void register_cpu_notifier(struct notifier_block *nb); diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/elfstructs.h xen-4.17.5/xen/include/xen/elfstructs.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/elfstructs.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/elfstructs.h 2024-08-14 09:03:57.000000000 +0000 @@ -563,6 +563,7 @@ #if defined(ELFSIZE) && (ELFSIZE == 32) #define PRIxElfAddr PRIx32 #define PRIuElfWord PRIu32 +#define PRIxElfWord PRIx32 #define Elf_Ehdr Elf32_Ehdr #define Elf_Phdr Elf32_Phdr @@ -591,6 +592,7 @@ #elif defined(ELFSIZE) && (ELFSIZE == 64) #define PRIxElfAddr PRIx64 #define PRIuElfWord PRIu64 +#define PRIxElfWord PRIx64 #define Elf_Ehdr Elf64_Ehdr #define Elf_Phdr Elf64_Phdr diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/event.h xen-4.17.5/xen/include/xen/event.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/event.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/event.h 2024-08-14 09:03:57.000000000 +0000 @@ -114,12 +114,12 @@ #define bucket_from_port(d, p) \ ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) -static inline void evtchn_read_lock(struct evtchn *evtchn) +static always_inline void evtchn_read_lock(struct evtchn *evtchn) { read_lock(&evtchn->lock); } -static inline bool evtchn_read_trylock(struct evtchn *evtchn) +static always_inline bool evtchn_read_trylock(struct evtchn *evtchn) { return read_trylock(&evtchn->lock); } diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/irq.h xen-4.17.5/xen/include/xen/irq.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/irq.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/irq.h 2024-08-14 09:03:57.000000000 +0000 @@ -130,6 +130,27 @@ #define irq_disable_none irq_actor_none #define irq_enable_none irq_actor_none +/* + * Per-cpu interrupted context register state - the inner-most interrupt frame + * on the stack. + */ +DECLARE_PER_CPU(struct cpu_user_regs *, irq_regs); + +static inline struct cpu_user_regs *get_irq_regs(void) +{ + return this_cpu(irq_regs); +} + +static inline struct cpu_user_regs *set_irq_regs(struct cpu_user_regs *new_regs) +{ + struct cpu_user_regs *old_regs, **pp_regs = &this_cpu(irq_regs); + + old_regs = *pp_regs; + *pp_regs = new_regs; + + return old_regs; +} + struct domain; struct vcpu; @@ -158,7 +179,7 @@ void pirq_cleanup_check(struct pirq *, struct domain *); #define pirq_cleanup_check(pirq, d) \ - ((pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0) + (!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0) extern void pirq_guest_eoi(struct pirq *); extern void desc_guest_eoi(struct irq_desc *, struct pirq *); @@ -173,8 +194,9 @@ unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *); +/* When passed a system domain, this returns the maximum permissible value. */ #ifndef arch_hwdom_irqs -unsigned int arch_hwdom_irqs(domid_t); +unsigned int arch_hwdom_irqs(const struct domain *); #endif #ifndef arch_evtchn_bind_pirq diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/livepatch.h xen-4.17.5/xen/include/xen/livepatch.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/livepatch.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/livepatch.h 2024-08-14 09:03:57.000000000 +0000 @@ -136,35 +136,11 @@ void arch_livepatch_mask(void); void arch_livepatch_unmask(void); -static inline void common_livepatch_apply(const struct livepatch_func *func, - struct livepatch_fstate *state) -{ - /* If the action has been already executed on this function, do nothing. 
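 *
 * (Editorial note: common_livepatch_apply() and common_livepatch_revert()
 * are dropped from this public header; the livepatch test payloads now
 * use the exported revert_payload()/revert_payload_tail() pair declared
 * below, as the xen_action_hooks_norevert.c hunk further down shows.)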
*/ - if ( state->applied == LIVEPATCH_FUNC_APPLIED ) - { - printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", - __func__, func->name); - return; - } +/* Only for testing purposes. */ +struct payload; +int revert_payload(struct payload *data); +void revert_payload_tail(struct payload *data); - arch_livepatch_apply(func, state); - state->applied = LIVEPATCH_FUNC_APPLIED; -} - -static inline void common_livepatch_revert(const struct livepatch_func *func, - struct livepatch_fstate *state) -{ - /* If the apply action hasn't been executed on this function, do nothing. */ - if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) - { - printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", - __func__, func->name); - return; - } - - arch_livepatch_revert(func, state); - state->applied = LIVEPATCH_FUNC_NOT_APPLIED; -} #else /* diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/mm.h xen-4.17.5/xen/include/xen/mm.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/mm.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/mm.h 2024-08-14 09:03:57.000000000 +0000 @@ -80,8 +80,9 @@ /* Free an allocation, and zero the pointer to it. */ #define FREE_XENHEAP_PAGES(p, o) do { \ - free_xenheap_pages(p, o); \ + void *_ptr_ = (p); \ (p) = NULL; \ + free_xenheap_pages(_ptr_, o); \ } while ( false ) #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/nospec.h xen-4.17.5/xen/include/xen/nospec.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/nospec.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/nospec.h 2024-08-14 09:03:57.000000000 +0000 @@ -70,6 +70,21 @@ #define array_access_nospec(array, index) \ (array)[array_index_nospec(index, ARRAY_SIZE(array))] +static always_inline void block_lock_speculation(void) +{ +#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK + arch_block_lock_speculation(); +#endif +} + +static always_inline bool lock_evaluate_nospec(bool condition) +{ +#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK + return arch_lock_evaluate_nospec(condition); +#endif + return condition; +} + #endif /* XEN_NOSPEC_H */ /* diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/param.h xen-4.17.5/xen/include/xen/param.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/param.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/param.h 2024-08-14 09:03:57.000000000 +0000 @@ -191,7 +191,7 @@ { int len = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); - printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%*s' setting\n", + printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%.*s' setting\n", cfg, param, len, s); } diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/pci.h xen-4.17.5/xen/include/xen/pci.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/pci.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/pci.h 2024-08-14 09:03:57.000000000 +0000 @@ -155,8 +155,12 @@ * devices, it also sync the access to the msi capability that is not * interrupt handling related (the mask bit register). 
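 *
 * (Editorial sketch, not part of the patch: this is the speculative
 * race-condition hardening pattern, believed to be XSA-453, applied
 * throughout this delta. The raw primitive is renamed *_unsafe() and
 * re-exposed through an always_inline wrapper:
 *
 *     static always_inline void pcidevs_lock(void)
 *     {
 *         pcidevs_lock_unsafe();      // architectural lock acquire
 *         block_lock_speculation();   // barrier, e.g. LFENCE on x86
 *     }
 *
 * The wrapper must be always_inline so the barrier is emitted at every
 * call site, stopping the critical section from being entered under
 * speculation before the lock is actually held.)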
*/ - -void pcidevs_lock(void); +void pcidevs_lock_unsafe(void); +static always_inline void pcidevs_lock(void) +{ + pcidevs_lock_unsafe(); + block_lock_speculation(); +} void pcidevs_unlock(void); bool_t __must_check pcidevs_locked(void); diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/rwlock.h xen-4.17.5/xen/include/xen/rwlock.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/rwlock.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/rwlock.h 2024-08-14 09:03:57.000000000 +0000 @@ -247,27 +247,49 @@ return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED; } -#define read_lock(l) _read_lock(l) -#define read_lock_irq(l) _read_lock_irq(l) +static always_inline void read_lock(rwlock_t *l) +{ + _read_lock(l); + block_lock_speculation(); +} + +static always_inline void read_lock_irq(rwlock_t *l) +{ + _read_lock_irq(l); + block_lock_speculation(); +} + #define read_lock_irqsave(l, f) \ ({ \ BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ ((f) = _read_lock_irqsave(l)); \ + block_lock_speculation(); \ }) #define read_unlock(l) _read_unlock(l) #define read_unlock_irq(l) _read_unlock_irq(l) #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) -#define read_trylock(l) _read_trylock(l) +#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l)) + +static always_inline void write_lock(rwlock_t *l) +{ + _write_lock(l); + block_lock_speculation(); +} + +static always_inline void write_lock_irq(rwlock_t *l) +{ + _write_lock_irq(l); + block_lock_speculation(); +} -#define write_lock(l) _write_lock(l) -#define write_lock_irq(l) _write_lock_irq(l) #define write_lock_irqsave(l, f) \ ({ \ BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ ((f) = _write_lock_irqsave(l)); \ + block_lock_speculation(); \ }) -#define write_trylock(l) _write_trylock(l) +#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l)) #define write_unlock(l) _write_unlock(l) #define write_unlock_irq(l) _write_unlock_irq(l) @@ -275,6 +297,8 @@ #define rw_is_locked(l) _rw_is_locked(l) #define rw_is_write_locked(l) _rw_is_write_locked(l) +#define rw_is_write_locked_by_me(l) \ + lock_evaluate_nospec(_is_write_locked_by_me(atomic_read(&(l)->cnts))) typedef struct percpu_rwlock percpu_rwlock_t; @@ -304,8 +328,8 @@ #define percpu_rwlock_resource_init(l, owner) \ (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner))) -static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, - percpu_rwlock_t *percpu_rwlock) +static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, + percpu_rwlock_t *percpu_rwlock) { /* Validate the correct per_cpudata variable has been provided. */ _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock); @@ -340,6 +364,8 @@ } else { + /* Other branch already has a speculation barrier in read_lock(). */ + block_lock_speculation(); /* All other paths have implicit check_lock() calls via read_lock(). 
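 *
 * (Editorial note: for the try-lock paths above, a barrier after the call
 * would not suffice, as the branch on the returned boolean could still be
 * entered speculatively; lock_evaluate_nospec() therefore wraps the
 * result so it is evaluated architecturally before dependent code runs.)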
check_lock(&percpu_rwlock->rwlock.lock.debug, false);
     }
@@ -388,8 +414,12 @@
     _percpu_read_lock(&get_per_cpu_var(percpu), lock)
 #define percpu_read_unlock(percpu, lock) \
     _percpu_read_unlock(&get_per_cpu_var(percpu), lock)
-#define percpu_write_lock(percpu, lock) \
-    _percpu_write_lock(&get_per_cpu_var(percpu), lock)
+
+#define percpu_write_lock(percpu, lock) \
+({ \
+    _percpu_write_lock(&get_per_cpu_var(percpu), lock); \
+    block_lock_speculation(); \
+})
 #define percpu_write_unlock(percpu, lock) \
     _percpu_write_unlock(&get_per_cpu_var(percpu), lock)
diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/spinlock.h xen-4.17.5/xen/include/xen/spinlock.h
--- xen-4.17.3+10-g091466ba55/xen/include/xen/spinlock.h 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/xen/include/xen/spinlock.h 2024-08-14 09:03:57.000000000 +0000
@@ -1,6 +1,7 @@
 #ifndef __SPINLOCK_H__
 #define __SPINLOCK_H__
+#include
 #include
 #include
 #include
@@ -189,21 +190,46 @@
 void _spin_lock_recursive(spinlock_t *lock);
 void _spin_unlock_recursive(spinlock_t *lock);
-#define spin_lock(l) _spin_lock(l)
-#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d)
-#define spin_lock_irq(l) _spin_lock_irq(l)
+static always_inline void spin_lock(spinlock_t *l)
+{
+    _spin_lock(l);
+    block_lock_speculation();
+}
+
+static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data),
+                                       void *d)
+{
+    _spin_lock_cb(l, c, d);
+    block_lock_speculation();
+}
+
+static always_inline void spin_lock_irq(spinlock_t *l)
+{
+    _spin_lock_irq(l);
+    block_lock_speculation();
+}
+
 #define spin_lock_irqsave(l, f) \
     ({ \
         BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \
         ((f) = _spin_lock_irqsave(l)); \
+        block_lock_speculation(); \
     })
+/* Conditionally take a spinlock in a speculation-safe way. */
+static always_inline void spin_lock_if(bool condition, spinlock_t *l)
+{
+    if ( condition )
+        _spin_lock(l);
+    block_lock_speculation();
+}
+
 #define spin_unlock(l) _spin_unlock(l)
 #define spin_unlock_irq(l) _spin_unlock_irq(l)
 #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f)
 #define spin_is_locked(l) _spin_is_locked(l)
-#define spin_trylock(l) _spin_trylock(l)
+#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l))
 #define spin_trylock_irqsave(lock, flags) \
     ({ \
@@ -224,8 +250,15 @@
 * are any critical regions that cannot form part of such a set, they can use
 * standard spin_[un]lock().
 */
-#define spin_trylock_recursive(l) _spin_trylock_recursive(l)
-#define spin_lock_recursive(l) _spin_lock_recursive(l)
+#define spin_trylock_recursive(l) \
+    lock_evaluate_nospec(_spin_trylock_recursive(l))
+
+static always_inline void spin_lock_recursive(spinlock_t *l)
+{
+    _spin_lock_recursive(l);
+    block_lock_speculation();
+}
+
 #define spin_unlock_recursive(l) _spin_unlock_recursive(l)
 #endif /* __SPINLOCK_H__ */
diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/virtual_region.h xen-4.17.5/xen/include/xen/virtual_region.h
--- xen-4.17.3+10-g091466ba55/xen/include/xen/virtual_region.h 2024-02-02 07:04:33.000000000 +0000
+++ xen-4.17.5/xen/include/xen/virtual_region.h 2024-08-14 09:03:57.000000000 +0000
@@ -9,11 +9,21 @@
 #include
 #include
+/*
+ * Despite its name, this is a module(ish) description.
+ *
+ * There's one region for the runtime .text/etc, one region for .init during
+ * boot only, and one region per livepatch.
+ */
 struct virtual_region
 {
     struct list_head list;
-    const void *start;      /* Virtual address start. */
-    const void *end;        /* Virtual address end. */
+
+    const void *text_start; /* .text virtual address start.
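 * (Editorial note: the former generic start/end pair becomes explicit
 * .text and .rodata ranges, letting livepatch regions describe both
 * sections.)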
*/ + const void *text_end; /* .text virtual address end. */ + + const void *rodata_start; /* .rodata virtual address start (optional). */ + const void *rodata_end; /* .rodata virtual address end. */ /* If this is NULL the default lookup mechanism is used. */ symbols_lookup_t *symbols_lookup; diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xen/xmalloc.h xen-4.17.5/xen/include/xen/xmalloc.h --- xen-4.17.3+10-g091466ba55/xen/include/xen/xmalloc.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xen/xmalloc.h 2024-08-14 09:03:57.000000000 +0000 @@ -66,9 +66,10 @@ extern void xfree(void *); /* Free an allocation, and zero the pointer to it. */ -#define XFREE(p) do { \ - xfree(p); \ - (p) = NULL; \ +#define XFREE(p) do { \ + void *_ptr_ = (p); \ + (p) = NULL; \ + xfree(_ptr_); \ } while ( false ) /* Underlying functions */ diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xsm/dummy.h xen-4.17.5/xen/include/xsm/dummy.h --- xen-4.17.3+10-g091466ba55/xen/include/xsm/dummy.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xsm/dummy.h 2024-08-14 09:03:57.000000000 +0000 @@ -162,7 +162,7 @@ } static XSM_INLINE int cf_check xsm_domctl( - XSM_DEFAULT_ARG struct domain *d, int cmd) + XSM_DEFAULT_ARG struct domain *d, unsigned int cmd, uint32_t ssidref) { XSM_ASSERT_ACTION(XSM_OTHER); switch ( cmd ) diff -Nru xen-4.17.3+10-g091466ba55/xen/include/xsm/xsm.h xen-4.17.5/xen/include/xsm/xsm.h --- xen-4.17.3+10-g091466ba55/xen/include/xsm/xsm.h 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/include/xsm/xsm.h 2024-08-14 09:03:57.000000000 +0000 @@ -60,7 +60,7 @@ int (*domctl_scheduler_op)(struct domain *d, int op); int (*sysctl_scheduler_op)(int op); int (*set_target)(struct domain *d, struct domain *e); - int (*domctl)(struct domain *d, int cmd); + int (*domctl)(struct domain *d, unsigned int cmd, uint32_t ssidref); int (*sysctl)(int cmd); int (*readconsole)(uint32_t clear); @@ -248,9 +248,10 @@ return alternative_call(xsm_ops.set_target, d, e); } -static inline int xsm_domctl(xsm_default_t def, struct domain *d, int cmd) +static inline int xsm_domctl(xsm_default_t def, struct domain *d, + unsigned int cmd, uint32_t ssidref) { - return alternative_call(xsm_ops.domctl, d, cmd); + return alternative_call(xsm_ops.domctl, d, cmd, ssidref); } static inline int xsm_sysctl(xsm_default_t def, int cmd) diff -Nru xen-4.17.3+10-g091466ba55/xen/test/livepatch/Makefile xen-4.17.5/xen/test/livepatch/Makefile --- xen-4.17.3+10-g091466ba55/xen/test/livepatch/Makefile 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/test/livepatch/Makefile 2024-08-14 09:03:57.000000000 +0000 @@ -118,12 +118,12 @@ $(obj)/xen_action_hooks_noapply.o: $(obj)/config.h extra-y += xen_action_hooks_noapply.livepatch -xen_action_hooks_noapply-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o +xen_action_hooks_noapply-objs := xen_action_hooks_noapply.o xen_hello_world_func.o note.o xen_note.o $(obj)/xen_action_hooks_norevert.o: $(obj)/config.h extra-y += xen_action_hooks_norevert.livepatch -xen_action_hooks_norevert-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o +xen_action_hooks_norevert-objs := xen_action_hooks_norevert.o xen_hello_world_func.o note.o xen_note.o EXPECT_BYTES_COUNT := 8 CODE_GET_EXPECT=$(shell $(OBJDUMP) -d --insn-width=1 $(1) | sed -n -e '/<'$(2)'>:$$/,/^$$/ p' | tail -n +2 | head -n $(EXPECT_BYTES_COUNT) | awk '{$$0=$$2; printf "%s", substr($$0,length-1)}' | sed 's/.\{2\}/0x&,/g' | sed 's/^/{/;s/,$$/}/g') diff -Nru 
xen-4.17.3+10-g091466ba55/xen/test/livepatch/xen_action_hooks_norevert.c xen-4.17.5/xen/test/livepatch/xen_action_hooks_norevert.c --- xen-4.17.3+10-g091466ba55/xen/test/livepatch/xen_action_hooks_norevert.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/test/livepatch/xen_action_hooks_norevert.c 2024-08-14 09:03:57.000000000 +0000 @@ -96,31 +96,19 @@ static void post_revert_hook(livepatch_payload_t *payload) { - int i; + unsigned long flags; printk(KERN_DEBUG "%s: Hook starting.\n", __func__); - for (i = 0; i < payload->nfuncs; i++) - { - const struct livepatch_func *func = &payload->funcs[i]; - struct livepatch_fstate *fstate = &payload->fstate[i]; - - BUG_ON(revert_cnt != 1); - BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - - /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ - arch_livepatch_quiesce(); - common_livepatch_revert(payload); - arch_livepatch_revive(); - BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - - printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); - } + local_irq_save(flags); + BUG_ON(revert_payload(payload)); + revert_payload_tail(payload); + local_irq_restore(flags); printk(KERN_DEBUG "%s: Hook done.\n", __func__); } -LIVEPATCH_APPLY_HOOK(revert_hook); +LIVEPATCH_REVERT_HOOK(revert_hook); LIVEPATCH_PREAPPLY_HOOK(pre_apply_hook); LIVEPATCH_POSTAPPLY_HOOK(post_apply_hook); diff -Nru xen-4.17.3+10-g091466ba55/xen/tools/gen-cpuid.py xen-4.17.5/xen/tools/gen-cpuid.py --- xen-4.17.3+10-g091466ba55/xen/tools/gen-cpuid.py 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/tools/gen-cpuid.py 2024-08-14 09:03:57.000000000 +0000 @@ -47,8 +47,8 @@ """ feat_regex = re.compile( r"^XEN_CPUFEATURE\(([A-Z0-9_]+)," - "\s+([\s\d]+\*[\s\d]+\+[\s\d]+)\)" - "\s+/\*([\w!]*) .*$") + r"\s+([\s\d]+\*[\s\d]+\+[\s\d]+)\)" + r"\s+/\*([\w!]*) .*$") word_regex = re.compile( r"^/\* .* word (\d*) \*/$") @@ -318,7 +318,8 @@ # IBRSB/IBRS, and we pass this MSR directly to guests. Treating them # as dependent features simplifies Xen's logic, and prevents the guest # from seeing implausible configurations. - IBRSB: [STIBP, SSBD, INTEL_PSFD, EIBRS], + IBRSB: [STIBP, SSBD, INTEL_PSFD, EIBRS, + IPRED_CTRL, RRSBA_CTRL, BHI_CTRL], IBRS: [AMD_STIBP, AMD_SSBD, PSFD, IBRS_ALWAYS, IBRS_FAST, IBRS_SAME_MODE], IBPB: [IBPB_RET, SBPB, IBPB_BRTYPE], diff -Nru xen-4.17.3+10-g091466ba55/xen/xsm/flask/hooks.c xen-4.17.5/xen/xsm/flask/hooks.c --- xen-4.17.3+10-g091466ba55/xen/xsm/flask/hooks.c 2024-02-02 07:04:33.000000000 +0000 +++ xen-4.17.5/xen/xsm/flask/hooks.c 2024-08-14 09:03:57.000000000 +0000 @@ -663,12 +663,22 @@ return rc; } -static int cf_check flask_domctl(struct domain *d, int cmd) +static int cf_check flask_domctl(struct domain *d, unsigned int cmd, + uint32_t ssidref) { switch ( cmd ) { - /* These have individual XSM hooks (common/domctl.c) */ case XEN_DOMCTL_createdomain: + /* + * There is a later hook too, but at this early point simply check + * that the calling domain is privileged enough to create a domain. + * + * Note that d is NULL because we haven't even allocated memory for it + * this early in XEN_DOMCTL_createdomain. + */ + return avc_current_has_perm(ssidref, SECCLASS_DOMAIN, DOMAIN__CREATE, NULL); + + /* These have individual XSM hooks (common/domctl.c) */ case XEN_DOMCTL_getdomaininfo: case XEN_DOMCTL_scheduler_op: case XEN_DOMCTL_irq_permission: