Version in base suite: 4.11.1+92-g6c33308a8d-2 Base version: xen_4.11.1+92-g6c33308a8d-2 Target version: xen_4.11.3+24-g14b62ab3e5-1~deb10u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/x/xen/xen_4.11.1+92-g6c33308a8d-2.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/x/xen/xen_4.11.3+24-g14b62ab3e5-1~deb10u1.dsc Config.mk | 6 debian/.gitignore | 39 debian/changelog | 72 + debian/patches/0004-Various-Fix-typo-occured.patch | 6 debian/patches/0007-Various-Fix-typo-infomation.patch | 4 debian/patches/0008-Various-Fix-typo-mappping.patch | 4 debian/patches/0044-Fix-empty-fields-in-first-hypervisor-log-line.patch | 2 debian/patches/misc/version.diff | 2 debian/patches/prefix-abiname/config-prefix.diff | 2 docs/misc/xen-command-line.markdown | 83 + tools/libxc/xc_cpuid_x86.c | 32 tools/libxl/libxl_pci.c | 25 tools/tests/vpci/emul.h | 3 tools/xentrace/xenctx.c | 4 xen/Makefile | 2 xen/arch/arm/Makefile | 1 xen/arch/arm/README.LinuxPrimitives | 15 xen/arch/arm/arm32/entry.S | 192 ++- xen/arch/arm/arm32/lib/Makefile | 5 xen/arch/arm/arm32/lib/bitops.c | 171 +++ xen/arch/arm/arm32/lib/bitops.h | 104 -- xen/arch/arm/arm32/lib/changebit.S | 14 xen/arch/arm/arm32/lib/clearbit.S | 14 xen/arch/arm/arm32/lib/findbit.S | 16 xen/arch/arm/arm32/lib/setbit.S | 15 xen/arch/arm/arm32/lib/testchangebit.S | 15 xen/arch/arm/arm32/lib/testclearbit.S | 15 xen/arch/arm/arm32/lib/testsetbit.S | 15 xen/arch/arm/arm64/entry.S | 47 xen/arch/arm/arm64/head.S | 11 xen/arch/arm/arm64/lib/bitops.S | 67 - xen/arch/arm/arm64/lib/bitops.c | 161 +++ xen/arch/arm/domain.c | 39 xen/arch/arm/gic-v2.c | 11 xen/arch/arm/gic-v3.c | 17 xen/arch/arm/guest_atomics.c | 91 + xen/arch/arm/guest_walk.c | 2 xen/arch/arm/irq.c | 9 xen/arch/arm/livepatch.c | 5 xen/arch/arm/mm.c | 40 xen/arch/arm/p2m.c | 42 xen/arch/arm/setup.c | 10 xen/arch/arm/time.c | 4 xen/arch/arm/traps.c | 33 xen/arch/arm/vgic.c | 5 xen/arch/arm/vgic/vgic.c | 5 xen/arch/arm/vsmc.c | 4 xen/arch/arm/xen.lds.S | 6 xen/arch/x86/Makefile | 45 xen/arch/x86/acpi/boot.c | 23 xen/arch/x86/apic.c | 6 xen/arch/x86/boot/Makefile | 5 xen/arch/x86/boot/wakeup.S | 15 xen/arch/x86/boot/x86_64.S | 26 xen/arch/x86/cpu/amd.c | 2 xen/arch/x86/cpu/common.c | 22 xen/arch/x86/cpu/vpmu.c | 47 xen/arch/x86/cpuid.c | 20 xen/arch/x86/crash.c | 26 xen/arch/x86/domain.c | 83 + xen/arch/x86/domctl.c | 30 xen/arch/x86/efi/efi-boot.h | 8 xen/arch/x86/flushtlb.c | 9 xen/arch/x86/hvm/hpet.c | 8 xen/arch/x86/hvm/hvm.c | 43 xen/arch/x86/hvm/ioreq.c | 4 xen/arch/x86/hvm/irq.c | 15 xen/arch/x86/hvm/svm/emulate.c | 54 + xen/arch/x86/hvm/svm/svm.c | 83 - xen/arch/x86/hvm/svm/svmdebug.c | 9 xen/arch/x86/hvm/svm/vmcb.c | 2 xen/arch/x86/hvm/vlapic.c | 7 xen/arch/x86/hvm/vmx/vmcs.c | 55 + xen/arch/x86/hvm/vmx/vmx.c | 275 ++++- xen/arch/x86/hvm/vmx/vvmx.c | 24 xen/arch/x86/hypercall.c | 20 xen/arch/x86/io_apic.c | 5 xen/arch/x86/irq.c | 3 xen/arch/x86/livepatch.c | 36 xen/arch/x86/microcode.c | 12 xen/arch/x86/mm.c | 512 ++++++---- xen/arch/x86/mm/p2m-ept.c | 11 xen/arch/x86/mm/p2m.c | 9 xen/arch/x86/mm/shadow/common.c | 3 xen/arch/x86/monitor.c | 3 xen/arch/x86/msi.c | 28 xen/arch/x86/msr.c | 37 xen/arch/x86/psr.c | 11 xen/arch/x86/pv/emul-gate-op.c | 10 xen/arch/x86/pv/emul-priv-op.c | 38 xen/arch/x86/pv/emulate.c | 9 xen/arch/x86/pv/mm.c | 8 xen/arch/x86/pv/shim.c | 3 xen/arch/x86/setup.c | 22 xen/arch/x86/smp.c | 40 xen/arch/x86/smpboot.c | 10 xen/arch/x86/spec_ctrl.c | 65 + xen/arch/x86/time.c | 38 xen/arch/x86/traps.c | 18 xen/arch/x86/tsx.c | 75 + xen/arch/x86/x86_64/mm.c | 9 
xen/arch/x86/x86_emulate/x86_emulate.c | 4 xen/arch/x86/xen.lds.S | 6 xen/common/Kconfig | 33 xen/common/compat/domain.c | 2 xen/common/domain.c | 14 xen/common/domctl.c | 14 xen/common/efi/boot.c | 10 xen/common/event_2l.c | 26 xen/common/event_fifo.c | 44 xen/common/grant_table.c | 141 +- xen/common/livepatch.c | 8 xen/common/lz4/decompress.c | 16 xen/common/memory.c | 2 xen/common/page_alloc.c | 12 xen/common/sched_credit2.c | 2 xen/common/schedule.c | 2 xen/drivers/passthrough/amd/iommu_init.c | 11 xen/drivers/passthrough/amd/iommu_intr.c | 5 xen/drivers/passthrough/amd/iommu_map.c | 205 +--- xen/drivers/passthrough/amd/pci_amd_iommu.c | 22 xen/drivers/passthrough/device_tree.c | 6 xen/drivers/passthrough/iommu.c | 27 xen/drivers/passthrough/pci.c | 67 + xen/drivers/passthrough/vtd/extern.h | 2 xen/drivers/passthrough/vtd/iommu.c | 94 + xen/drivers/passthrough/vtd/iommu.h | 3 xen/drivers/passthrough/vtd/qinval.c | 9 xen/drivers/passthrough/vtd/quirks.c | 25 xen/drivers/video/vesa.c | 14 xen/drivers/vpci/vpci.c | 5 xen/include/asm-arm/arm32/bitops.h | 14 xen/include/asm-arm/arm32/cmpxchg.h | 127 +- xen/include/asm-arm/arm64/bitops.h | 10 xen/include/asm-arm/arm64/cmpxchg.h | 146 +- xen/include/asm-arm/atomic.h | 15 xen/include/asm-arm/bitops.h | 38 xen/include/asm-arm/domain.h | 3 xen/include/asm-arm/grant_table.h | 2 xen/include/asm-arm/guest_atomics.h | 126 ++ xen/include/asm-arm/mm.h | 8 xen/include/asm-arm/p2m.h | 5 xen/include/asm-arm/percpu.h | 6 xen/include/asm-arm/perfc_defn.h | 3 xen/include/asm-arm/time.h | 7 xen/include/asm-x86/bitops.h | 4 xen/include/asm-x86/domain.h | 4 xen/include/asm-x86/grant_table.h | 3 xen/include/asm-x86/guest_atomics.h | 32 xen/include/asm-x86/hvm/hvm.h | 13 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 3 xen/include/asm-x86/hvm/svm/emulate.h | 2 xen/include/asm-x86/hvm/svm/svmdebug.h | 1 xen/include/asm-x86/hvm/vlapic.h | 6 xen/include/asm-x86/hvm/vmx/vmcs.h | 6 xen/include/asm-x86/hvm/vmx/vmx.h | 2 xen/include/asm-x86/mm.h | 49 xen/include/asm-x86/msi.h | 1 xen/include/asm-x86/msr-index.h | 7 xen/include/asm-x86/percpu.h | 6 xen/include/asm-x86/processor.h | 22 xen/include/public/arch-arm.h | 4 xen/include/public/domctl.h | 4 xen/include/public/xen-compat.h | 2 xen/include/public/xen.h | 6 xen/include/xen/cpuidle.h | 2 xen/include/xen/event.h | 3 xen/include/xen/iommu.h | 3 xen/include/xen/lib.h | 10 xen/include/xen/livepatch.h | 1 xen/include/xen/pci.h | 3 xen/include/xen/percpu.h | 12 xen/include/xsm/dummy.h | 3 xen/include/xsm/xsm.h | 11 xen/xsm/Makefile | 1 xen/xsm/dummy.c | 1 xen/xsm/silo.c | 108 ++ xen/xsm/xsm_core.c | 57 + 178 files changed, 3677 insertions(+), 1524 deletions(-) diff -Nru xen-4.11.1+92-g6c33308a8d/Config.mk xen-4.11.3+24-g14b62ab3e5/Config.mk --- xen-4.11.1+92-g6c33308a8d/Config.mk 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/Config.mk 2019-12-11 14:35:39.000000000 +0000 @@ -275,15 +275,15 @@ MINIOS_UPSTREAM_URL ?= git://xenbits.xen.org/mini-os.git endif OVMF_UPSTREAM_REVISION ?= 947f3737abf65fda63f3ffd97fddfa6986986868 -QEMU_UPSTREAM_REVISION ?= qemu-xen-4.11.1 -MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.11.1 +QEMU_UPSTREAM_REVISION ?= qemu-xen-4.11.3 +MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.11.3 SEABIOS_UPSTREAM_REVISION ?= rel-1.11.1 ETHERBOOT_NICS ?= rtl8139 8086100e -QEMU_TRADITIONAL_REVISION ?= xen-4.11.1 +QEMU_TRADITIONAL_REVISION ?= xen-4.11.3 # Fri Sep 15 19:37:27 2017 +0100 # qemu-xen-traditional: Link against xentoolcore diff -Nru xen-4.11.1+92-g6c33308a8d/debian/.gitignore 
xen-4.11.3+24-g14b62ab3e5/debian/.gitignore --- xen-4.11.1+92-g6c33308a8d/debian/.gitignore 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/.gitignore 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -.debhelper -*.debhelper.* -*.preinst.debhelper -*.postinst.debhelper -*.prerm.debhelper -*.postrm.debhelper -*.substvars -*.stamp -tmp -*-[0-9]*.bug-control -*-[0-9]*.postinst -*-[0-9]*.postrm -*.tmp -files -xen-doc -xen-hypervisor-common -xen-system-amd64 -xen-system-armhf -xen-system-arm64 -xen-hypervisor-[0-9]*[0-9] -xen-hypervisor-[0-9]*[0-9].install -xen-hypervisor-[0-9]*[0-9].lintian-overrides -xen-utils-[0-9]*[0-9] -xen-utils-[0-9]*[0-9].install -xen-utils-[0-9]*[0-9].NEWS -xen-utils-[0-9]*[0-9].README.Debian -xen-utils-[0-9]*[0-9].lintian-overrides -xen-utils-[0-9]*[0-9].prerm -libxenmisc[0-9]*[0-9].lintian-overrides -libxenmisc[0-9]*[0-9] -libxenmisc[0-9]*[0-9].install -libxenmisc[0-9]*[0-9].lintian-overrides -libxen-dev -libxen*[0-9] -xen-utils-common -xenstore-utils -autoreconf.before -autoreconf.after -debhelper-build-stamp diff -Nru xen-4.11.1+92-g6c33308a8d/debian/changelog xen-4.11.3+24-g14b62ab3e5/debian/changelog --- xen-4.11.1+92-g6c33308a8d/debian/changelog 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/changelog 2020-01-08 12:21:23.000000000 +0000 @@ -1,3 +1,47 @@ +xen (4.11.3+24-g14b62ab3e5-1~deb10u1) buster-security; urgency=high + + * Rebuild for buster-security + + -- Hans van Kranenburg Wed, 08 Jan 2020 13:21:23 +0100 + +xen (4.11.3+24-g14b62ab3e5-1) unstable; urgency=high + + * Update to new upstream version 4.11.3+24-g14b62ab3e5, which also + contains the following security fixes: (Closes: #947944) + - Unlimited Arm Atomics Operations + XSA-295 CVE-2019-17349 CVE-2019-17350 + - VCPUOP_initialise DoS + XSA-296 CVE-2019-18420 + - missing descriptor table limit checking in x86 PV emulation + XSA-298 CVE-2019-18425 + - Issues with restartable PV type change operations + XSA-299 CVE-2019-18421 + - add-to-physmap can be abused to DoS Arm hosts + XSA-301 CVE-2019-18423 + - passed through PCI devices may corrupt host memory after deassignment + XSA-302 CVE-2019-18424 + - ARM: Interrupts are unconditionally unmasked in exception handlers + XSA-303 CVE-2019-18422 + - x86: Machine Check Error on Page Size Change DoS + XSA-304 CVE-2018-12207 + - TSX Asynchronous Abort speculative side channel + XSA-305 CVE-2019-11135 + - Device quarantine for alternate pci assignment methods + XSA-306 CVE-2019-19579 + - find_next_bit() issues + XSA-307 CVE-2019-19581 CVE-2019-19582 + - VMX: VMentry failure with debug exceptions and blocked states + XSA-308 CVE-2019-19583 + - Linear pagetable use / entry miscounts + XSA-309 CVE-2019-19578 + - Further issues with restartable PV type change operations + XSA-310 CVE-2019-19580 + - Bugs in dynamic height handling for AMD IOMMU pagetables + XSA-311 CVE-2019-19577 + * Add missing CVE numbers to previous changelog entries + + -- Hans van Kranenburg Wed, 08 Jan 2020 12:41:42 +0100 + xen (4.11.1+92-g6c33308a8d-2) unstable; urgency=high * Mention MDS and the need for updated microcode and disabling @@ -11,23 +55,23 @@ * Update to new upstream version 4.11.1+92-g6c33308a8d, which also contains the following security fixes: - Fix: grant table transfer issues on large hosts - XSA-284 (no CVE yet) (Closes: #929991) + XSA-284 CVE-2019-17340 (Closes: #929991) - Fix: race with pass-through device hotplug - XSA-285 (no CVE yet) (Closes: #929998) + XSA-285 CVE-2019-17341 (Closes: #929998) - Fix: x86: 
steal_page violates page_struct access discipline - XSA-287 (no CVE yet) (Closes: #930001) + XSA-287 CVE-2019-17342 (Closes: #930001) - Fix: x86: Inconsistent PV IOMMU discipline - XSA-288 (no CVE yet) (Closes: #929994) + XSA-288 CVE-2019-17343 (Closes: #929994) - Fix: missing preemption in x86 PV page table unvalidation - XSA-290 (no CVE yet) (Closes: #929996) + XSA-290 CVE-2019-17344 (Closes: #929996) - Fix: x86/PV: page type reference counting issue with failed IOMMU update - XSA-291 (no CVE yet) (Closes: #929995) + XSA-291 CVE-2019-17345 (Closes: #929995) - Fix: x86: insufficient TLB flushing when using PCID - XSA-292 (no CVE yet) (Closes: #929993) + XSA-292 CVE-2019-17346 (Closes: #929993) - Fix: x86: PV kernel context switch corruption - XSA-293 (no CVE yet) (Closes: #929999) + XSA-293 CVE-2019-17347 (Closes: #929999) - Fix: x86 shadow: Insufficient TLB flushing when using PCID - XSA-294 (no CVE yet) (Closes: #929992) + XSA-294 CVE-2019-17348 (Closes: #929992) - Fix: Microarchitectural Data Sampling speculative side channel XSA-297 CVE-2018-12126 CVE-2018-12127 CVE-2018-12130 CVE-2019-11091 (Closes: #929129) @@ -362,19 +406,19 @@ (235 already included in 4.8.1-1+deb9u3) XSA-236 CVE-2017-15597 XSA-237 CVE-2017-15590 - XSA-238 (no CVE yet) + XSA-238 CVE-2017-15591 XSA-239 CVE-2017-15589 XSA-240 CVE-2017-15595 XSA-241 CVE-2017-15588 XSA-242 CVE-2017-15593 XSA-243 CVE-2017-15592 XSA-244 CVE-2017-15594 - XSA-245 (no CVE yet) + XSA-245 CVE-2017-17046 and a number of upstream functionality fixes, which are not easily disentangled from the security fixes. * Apply two more security fixes: - XSA-246 (no CVE yet) - XSA-247 (no CVE yet) + XSA-246 CVE-2017-17044 + XSA-247 CVE-2017-17045 -- Ian Jackson Sat, 25 Nov 2017 11:26:37 +0000 @@ -385,7 +429,7 @@ XSA-227 CVE-2017-12137 XSA-228 CVE-2017-12136 XSA-230 CVE-2017-12855 - XSA-235 (no CVE yet) + XSA-235 CVE-2017-15596 * Adjust changelog entry for 4.8.1-1+deb9u2 to record that XSA-225 fix was indeed included. * Security fix for XSA-229 not included as that bug is in Linux, not Xen. 
diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/0004-Various-Fix-typo-occured.patch xen-4.11.3+24-g14b62ab3e5/debian/patches/0004-Various-Fix-typo-occured.patch --- xen-4.11.1+92-g6c33308a8d/debian/patches/0004-Various-Fix-typo-occured.patch 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/0004-Various-Fix-typo-occured.patch 2020-01-08 12:21:23.000000000 +0000 @@ -41,10 +41,10 @@ /* * For little endian,reverse the low significant equal bits into MSB,then diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c -index 2d3c55a..73d8769 100644 +index aa7d747..4f16783 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c -@@ -1736,7 +1736,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, +@@ -1741,7 +1741,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, case NESTEDHVM_PAGEFAULT_RETRY: return 1; case NESTEDHVM_PAGEFAULT_L1_ERROR: @@ -93,7 +93,7 @@ **/ typedef diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h -index fb1df8f..68ee098 100644 +index 6c180c4..5a5a2fb 100644 --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -177,7 +177,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t); diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/0007-Various-Fix-typo-infomation.patch xen-4.11.3+24-g14b62ab3e5/debian/patches/0007-Various-Fix-typo-infomation.patch --- xen-4.11.1+92-g6c33308a8d/debian/patches/0007-Various-Fix-typo-infomation.patch 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/0007-Various-Fix-typo-infomation.patch 2020-01-08 12:21:23.000000000 +0000 @@ -78,10 +78,10 @@ * For more information about ERST, please refer to ACPI Specification * version 4.0, section 17.4. diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h -index 0535da8..175baa5 100644 +index 0ab9fa0..c0de281 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h -@@ -491,7 +491,7 @@ struct xen_domctl_assign_device { +@@ -495,7 +495,7 @@ struct xen_domctl_assign_device { } u; }; diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/0008-Various-Fix-typo-mappping.patch xen-4.11.3+24-g14b62ab3e5/debian/patches/0008-Various-Fix-typo-mappping.patch --- xen-4.11.1+92-g6c33308a8d/debian/patches/0008-Various-Fix-typo-mappping.patch 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/0008-Various-Fix-typo-mappping.patch 2020-01-08 12:21:23.000000000 +0000 @@ -22,10 +22,10 @@ " map_limitkb [int]: .\n" "Returns: [int] 0 on success; -1 on error.\n" }, diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h -index 72c1a2e..5cd14b8 100644 +index e76b261..179b859 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h -@@ -510,7 +510,7 @@ struct qi_ctrl { +@@ -509,7 +509,7 @@ struct qi_ctrl { struct ir_ctrl { u64 iremap_maddr; /* interrupt remap table machine address */ int iremap_num; /* total num of used interrupt remap entry */ diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/0044-Fix-empty-fields-in-first-hypervisor-log-line.patch xen-4.11.3+24-g14b62ab3e5/debian/patches/0044-Fix-empty-fields-in-first-hypervisor-log-line.patch --- xen-4.11.1+92-g6c33308a8d/debian/patches/0044-Fix-empty-fields-in-first-hypervisor-log-line.patch 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/0044-Fix-empty-fields-in-first-hypervisor-log-line.patch 2020-01-08 12:21:23.000000000 +0000 @@ -28,7 +28,7 @@ 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/xen/Makefile b/xen/Makefile -index c7ed37f..45ba4ce 100644 +index 82ec4af..618777d 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -173,9 +173,9 @@ include/xen/compile.h: include/xen/compile.h.in diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/misc/version.diff xen-4.11.3+24-g14b62ab3e5/debian/patches/misc/version.diff --- xen-4.11.1+92-g6c33308a8d/debian/patches/misc/version.diff 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/misc/version.diff 2020-01-08 12:21:23.000000000 +0000 @@ -13,7 +13,7 @@ 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/xen/Makefile b/xen/Makefile -index 047427e..c7ed37f 100644 +index af27147..82ec4af 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -161,7 +161,7 @@ delete-unfresh-files: diff -Nru xen-4.11.1+92-g6c33308a8d/debian/patches/prefix-abiname/config-prefix.diff xen-4.11.3+24-g14b62ab3e5/debian/patches/prefix-abiname/config-prefix.diff --- xen-4.11.1+92-g6c33308a8d/debian/patches/prefix-abiname/config-prefix.diff 2019-06-22 09:15:08.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/debian/patches/prefix-abiname/config-prefix.diff 2020-01-08 12:21:23.000000000 +0000 @@ -9,7 +9,7 @@ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Config.mk b/Config.mk -index 079f335..7bcb15d 100644 +index f1ed5e9..328e3bb 100644 --- a/Config.mk +++ b/Config.mk @@ -83,7 +83,7 @@ EXTRA_LIB += $(EXTRA_PREFIX)/lib diff -Nru xen-4.11.1+92-g6c33308a8d/docs/misc/xen-command-line.markdown xen-4.11.3+24-g14b62ab3e5/docs/misc/xen-command-line.markdown --- xen-4.11.1+92-g6c33308a8d/docs/misc/xen-command-line.markdown 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/docs/misc/xen-command-line.markdown 2019-12-11 14:35:39.000000000 +0000 @@ -828,7 +828,7 @@ >> set as UC. ### ept (Intel) -> `= List of ( {no-}pml | {no-}ad )` +> `= List of [ {no-}pml, {no-}ad, {no-}exec-sp ]` Controls EPT related features. @@ -851,6 +851,31 @@ >> Have hardware keep accessed/dirty (A/D) bits updated. +* The `exec-sp` boolean controls whether EPT superpages with execute + permissions are permitted. In general this is good for performance. + + However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can + use executable superpages to crash the host. By default, executable + superpages are disabled on affected hardware. + + If HVM guest kernels are trusted not to mount a DoS against the system, + this option can be enabled to regain performance. + + This boolean may be modified at runtime using `xl set-parameters + ept=[no-]exec-sp` to switch between fast and secure. + + * When switching from secure to fast, preexisting HVM domains will run + at their current performance until they are rebooted; new domains will + run without any overhead. + + * When switching from fast to secure, all HVM domains will immediately + suffer a performance penalty. + + **Warning: No guarantee is made that this runtime option will be retained + indefinitely, or that it will retain this exact behaviour. It is + intended as an emergency option for people who first chose fast, then + change their minds to secure, and wish not to reboot.** + ### extra\_guest\_irqs > `= [][,]` @@ -865,6 +890,22 @@ Note that specifying zero as domU value means zero, while for dom0 it means to use the default. +### xsm +> `= dummy | flask | silo` + +> Default: `dummy` + +Specify which XSM module should be enabled. This option is only available if +the hypervisor was compiled with XSM support. + +* `dummy`: this is the default choice.
Basic restriction for common deployment + (the dummy module) will be applied. It's also used when XSM is compiled out. +* `flask`: this is the policy based access control. To choose this, the + separated option in kconfig must also be enabled. +* `silo`: this will deny any unmediated communication channels between + unprivileged VMs. To choose this, the separated option in kconfig must also + be enabled. + ### flask > `= permissive | enforcing | late | disabled` @@ -1096,7 +1137,7 @@ > Default: `new` unless directed-EOI is supported ### iommu -> `= List of [ | force | required | intremap | intpost | qinval | snoop | sharept | dom0-passthrough | dom0-strict | amd-iommu-perdev-intremap | workaround_bios_bug | igfx | crash-disable | verbose | debug ]` +> `= List of [ | force | required | quarantine | intremap | intpost | qinval | snoop | sharept | dom0-passthrough | dom0-strict | amd-iommu-perdev-intremap | workaround_bios_bug | igfx | crash-disable | verbose | debug ]` > Sub-options: @@ -1116,6 +1157,15 @@ >> Don't continue booting unless IOMMU support is found and can be initialized >> successfully. +> `quarantine` + +> Default: `true` + +>> Control Xen's behavior when de-assigning devices from guests. If enabled, +>> Xen always quarantines such devices; they must be explicitly assigned back +>> to Dom0 before they can be used there again. If disabled, Xen will only +>> quarantine devices the toolstack has arranged for getting quarantined. + > `intremap` > Default: `true` @@ -1815,7 +1865,7 @@ An overall boolean value, `spec-ctrl=no`, can be specified to turn off all mitigations, including pieces of infrastructure used to virtualise certain mitigation features for guests. This also includes settings which `xpti`, -`smt`, `pv-l1tf` control, unless the respective option(s) have been +`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been specified earlier on the command line. Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to @@ -1922,6 +1972,33 @@ ### tsc (x86) > `= unstable | skewed | stable:socket` +### tsx + = <boolean> + + Applicability: x86 + Default: false on parts vulnerable to TAA, true otherwise +Controls for the use of Transactional Synchronization eXtensions. +On Intel parts released in Q3 2019 (with updated microcode), and future parts, +a control has been introduced which allows TSX to be turned off. +On systems with the ability to turn TSX off, this boolean offers system wide +control of whether TSX is enabled or disabled. +On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following +logic applies: + + * An explicit `tsx=` choice is honoured, even if it is `true` and would + result in a vulnerable system. + + * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be + mitigated by disabling TSX, as this is the lowest overhead option. + + * If the use of TSX is important, the more expensive TAA mitigations can be + opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain + active by default. 
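As a usage sketch (assuming a Debian host that boots the Xen hypervisor via GRUB; the GRUB_CMDLINE_XEN_DEFAULT variable and the particular option values below are illustrative assumptions, not taken from this diff), the boolean controls documented above are normally set on the hypervisor command line and applied with update-grub:

    # /etc/default/grub -- opt in to the TAA mitigations named in the tsx
    # section above (TSX then remains active by default), and leave IOMMU
    # device quarantining at its default of enabled, stated explicitly here.
    GRUB_CMDLINE_XEN_DEFAULT="smt=0 spec-ctrl=md-clear iommu=quarantine"

    # Regenerate the boot configuration, then reboot for the settings to apply.
    update-grub

The `ept=[no-]exec-sp` control described earlier can additionally be changed on a running host with `xl set-parameters ept=no-exec-sp` (or `ept=exec-sp`), without a reboot.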
+ ### ucode (x86) > `= [ | scan]` diff -Nru xen-4.11.1+92-g6c33308a8d/tools/libxc/xc_cpuid_x86.c xen-4.11.3+24-g14b62ab3e5/tools/libxc/xc_cpuid_x86.c --- xen-4.11.1+92-g6c33308a8d/tools/libxc/xc_cpuid_x86.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/tools/libxc/xc_cpuid_x86.c 2019-12-11 14:35:39.000000000 +0000 @@ -335,7 +335,7 @@ { case 0x00000002: case 0x00000004: - regs[0] = regs[1] = regs[2] = 0; + regs[0] = regs[1] = regs[2] = regs[3] = 0; break; case 0x80000000: @@ -345,11 +345,25 @@ case 0x80000008: /* - * ECX[15:12] is ApicIdCoreSize: ECX[7:0] is NumberOfCores (minus one). - * Update to reflect vLAPIC_ID = vCPU_ID * 2. + * ECX[15:12] is ApicIdCoreSize. + * ECX[7:0] is NumberOfCores (minus one). + * Update to reflect vLAPIC_ID = vCPU_ID * 2. But make sure to avoid + * - overflow, + * - going out of sync with leaf 1 EBX[23:16], + * - incrementing ApicIdCoreSize when it's zero (which changes the + * meaning of bits 7:0). + * + * UPDATE: In addition to avoiding overflow, some + * proprietary operating systems have trouble with + * apic_id_size values greater than 7. Limit the value to + * 7 for now. */ - regs[2] = ((regs[2] + (1u << 12)) & 0xf000u) | - ((regs[2] & 0xffu) << 1) | 1u; + if ( (regs[2] & 0xffu) < 0x7fu ) + { + if ( (regs[2] & 0xf000u) && (regs[2] & 0xf000u) < 0x7000u ) + regs[2] = ((regs[2] + 0x1000u) & 0xf000u) | (regs[2] & 0xffu); + regs[2] = (regs[2] & 0xf000u) | ((regs[2] & 0x7fu) << 1) | 1u; + } break; case 0x8000000a: { @@ -430,9 +444,13 @@ case 0x00000001: /* * EBX[23:16] is Maximum Logical Processors Per Package. - * Update to reflect vLAPIC_ID = vCPU_ID * 2. + * Update to reflect vLAPIC_ID = vCPU_ID * 2, but make sure to avoid + * overflow. */ - regs[1] = (regs[1] & 0x0000ffffu) | ((regs[1] & 0x007f0000u) << 1); + if ( !(regs[1] & 0x00800000u) ) + regs[1] = (regs[1] & 0x0000ffffu) | ((regs[1] & 0x007f0000u) << 1); + else + regs[1] &= 0x00ffffffu; regs[2] = info->featureset[featureword_of(X86_FEATURE_SSE3)]; regs[3] = (info->featureset[featureword_of(X86_FEATURE_FPU)] | diff -Nru xen-4.11.1+92-g6c33308a8d/tools/libxl/libxl_pci.c xen-4.11.3+24-g14b62ab3e5/tools/libxl/libxl_pci.c --- xen-4.11.1+92-g6c33308a8d/tools/libxl/libxl_pci.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/tools/libxl/libxl_pci.c 2019-12-11 14:35:39.000000000 +0000 @@ -754,6 +754,7 @@ libxl_device_pci *pcidev, int rebind) { + libxl_ctx *ctx = libxl__gc_owner(gc); unsigned dom, bus, dev, func; char *spath, *driver_path = NULL; int rc; @@ -779,7 +780,7 @@ } if ( rc ) { LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func); - return 0; + goto quarantine; } /* Check to see if there's already a driver that we need to unbind from */ @@ -810,6 +811,19 @@ return ERROR_FAIL; } +quarantine: + /* + * DOMID_IO is just a sentinel domain, without any actual mappings, + * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being + * unnecessarily denied. 
+ */ + rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev), + XEN_DOMCTL_DEV_RDM_RELAXED); + if ( rc < 0 ) { + LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func); + return ERROR_FAIL; + } + return 0; } @@ -817,9 +831,18 @@ libxl_device_pci *pcidev, int rebind) { + libxl_ctx *ctx = libxl__gc_owner(gc); int rc; char *driver_path; + /* De-quarantine */ + rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev)); + if ( rc < 0 ) { + LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus, + pcidev->dev, pcidev->func); + return ERROR_FAIL; + } + /* Unbind from pciback */ if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) { return ERROR_FAIL; diff -Nru xen-4.11.1+92-g6c33308a8d/tools/tests/vpci/emul.h xen-4.11.3+24-g14b62ab3e5/tools/tests/vpci/emul.h --- xen-4.11.1+92-g6c33308a8d/tools/tests/vpci/emul.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/tools/tests/vpci/emul.h 2019-12-11 14:35:39.000000000 +0000 @@ -92,6 +92,9 @@ #define xfree(p) free(p) #define pci_get_pdev_by_domain(...) &test_pdev +#define pci_get_ro_map(...) NULL + +#define test_bit(...) false /* Dummy native helpers. Writes are ignored, reads return 1's. */ #define pci_conf_read8(...) 0xff diff -Nru xen-4.11.1+92-g6c33308a8d/tools/xentrace/xenctx.c xen-4.11.3+24-g14b62ab3e5/tools/xentrace/xenctx.c --- xen-4.11.1+92-g6c33308a8d/tools/xentrace/xenctx.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/tools/xentrace/xenctx.c 2019-12-11 14:35:39.000000000 +0000 @@ -598,6 +598,8 @@ printf("r12_fiq: %08"PRIx32"\n", regs->r12_fiq); printf("\n"); + /* SCTLR is always 32-bit */ + printf("SCTLR: %08"PRIx32"\n", (uint32_t)ctx->sctlr); } #ifdef __aarch64__ @@ -659,6 +661,7 @@ printf("x28: %016"PRIx64"\t", regs->x28); printf("x29: %016"PRIx64"\n", regs->x29); printf("\n"); + printf("SCTLR_EL1: %016"PRIx64"\n", ctx->sctlr); } #endif /* __aarch64__ */ @@ -675,7 +678,6 @@ print_ctx_32(ctx); #endif - printf("SCTLR: %08"PRIx32"\n", ctx->sctlr); printf("TTBCR: %016"PRIx64"\n", ctx->ttbcr); printf("TTBR0: %016"PRIx64"\n", ctx->ttbr0); printf("TTBR1: %016"PRIx64"\n", ctx->ttbr1); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/Makefile xen-4.11.3+24-g14b62ab3e5/xen/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -2,7 +2,7 @@ # All other places this is stored (eg. compile.h) should be autogenerated. 
export XEN_VERSION = 4 export XEN_SUBVERSION = 11 -export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) +export XEN_EXTRAVERSION ?= .4-pre$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/Makefile xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -21,6 +21,7 @@ obj-$(CONFIG_HAS_ITS) += gic-v3-its.o obj-$(CONFIG_HAS_ITS) += gic-v3-lpi.o obj-y += guestcopy.o +obj-y += guest_atomics.o obj-y += guest_walk.o obj-y += hvm.o obj-y += io.o diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/README.LinuxPrimitives xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/README.LinuxPrimitives --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/README.LinuxPrimitives 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/README.LinuxPrimitives 2019-12-11 14:35:39.000000000 +0000 @@ -8,7 +8,6 @@ bitops: last sync @ v3.16-rc6 (last commit: 8715466b6027) -linux/arch/arm64/lib/bitops.S xen/arch/arm/arm64/lib/bitops.S linux/arch/arm64/include/asm/bitops.h xen/include/asm-arm/arm64/bitops.h --------------------------------------------------------------------- @@ -69,19 +68,9 @@ bitops: last sync @ v3.16-rc6 (last commit: c32ffce0f66e) -linux/arch/arm/lib/bitops.h xen/arch/arm/arm32/lib/bitops.h -linux/arch/arm/lib/changebit.S xen/arch/arm/arm32/lib/changebit.S -linux/arch/arm/lib/clearbit.S xen/arch/arm/arm32/lib/clearbit.S linux/arch/arm/lib/findbit.S xen/arch/arm/arm32/lib/findbit.S -linux/arch/arm/lib/setbit.S xen/arch/arm/arm32/lib/setbit.S -linux/arch/arm/lib/testchangebit.S xen/arch/arm/arm32/lib/testchangebit.S -linux/arch/arm/lib/testclearbit.S xen/arch/arm/arm32/lib/testclearbit.S -linux/arch/arm/lib/testsetbit.S xen/arch/arm/arm32/lib/testsetbit.S - -for i in bitops.h changebit.S clearbit.S findbit.S setbit.S testchangebit.S \ - testclearbit.S testsetbit.S; do - diff -u ../linux/arch/arm/lib/$i xen/arch/arm/arm32/lib/$i; -done + +diff -u ../linux/arch/arm/lib/findbit.S xen/arch/arm/arm32/lib/findbit.S --------------------------------------------------------------------- diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/entry.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/entry.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/entry.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/entry.S 2019-12-11 14:35:39.000000000 +0000 @@ -3,6 +3,17 @@ #include #include +/* + * Short-hands to defined the interrupts (A, I, F) + * + * _ means the interrupt state will not change + * X means the state of interrupt X will change + * + * To be used with msr cpsr_* only + */ +#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK +#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK + #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg] #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11 @@ -12,27 +23,6 @@ #define RESTORE_BANKED(mode) \ RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode) -#define SAVE_ALL \ - sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \ - push {r0-r12}; /* Save R0-R12 */ \ - \ - mrs r11, ELR_hyp; /* ELR_hyp is return address. 
*/\ - str r11, [sp, #UREGS_pc]; \ - \ - str lr, [sp, #UREGS_lr]; \ - \ - add r11, sp, #UREGS_kernel_sizeof+4; \ - str r11, [sp, #UREGS_sp]; \ - \ - mrc CP32(r11, HSR); /* Save exception syndrome */ \ - str r11, [sp, #UREGS_hsr]; \ - \ - mrs r11, SPSR_hyp; \ - str r11, [sp, #UREGS_cpsr]; \ - and r11, #PSR_MODE_MASK; \ - cmp r11, #PSR_MODE_HYP; \ - blne save_guest_regs - save_guest_regs: #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR /* @@ -51,7 +41,7 @@ ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */ str r11, [sp, #UREGS_sp] SAVE_ONE_BANKED(SP_usr) - /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */ + /* LR_usr is the same physical register as lr and is saved by the caller */ SAVE_BANKED(svc) SAVE_BANKED(abt) SAVE_BANKED(und) @@ -125,33 +115,79 @@ skip_check: mov pc, lr -/* - * Macro to define trap entry. The iflags corresponds to the list of - * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. - */ -#define __DEFINE_TRAP_ENTRY(trap, iflags) \ - ALIGN; \ -trap_##trap: \ - SAVE_ALL; \ - cpsie iflags; \ - adr lr, return_from_trap; \ - mov r0, sp; \ - /* \ - * Save the stack pointer in r11. It will be restored after the \ - * trap has been handled (see return_from_trap). \ - */ \ - mov r11, sp; \ - bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ - b do_trap_##trap + /* + * Macro to define a trap entry. + * + * @guest_iflags: Optional list of interrupts to unmask when + * entering from guest context. As this is used with cpsie, + * the letter (a, i, f) should be used. + * + * @hyp_iflags: Optional list of interrupts to inherit when + * entering from hypervisor context. Any interrupts not + * listed will be kept unchanged. As this is used with cpsr_*, + * IFLAGS_* short-hands should be used. + */ + .macro vector trap, guest_iflags=n, hyp_iflags=0 + /* Save registers in the stack */ + sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ + push {r0-r12} /* Save R0-R12 */ + mrs r11, ELR_hyp /* ELR_hyp is return address */ + str r11, [sp, #UREGS_pc] -/* Trap handler which unmask IRQ/Abort, keep FIQ masked */ -#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) + str lr, [sp, #UREGS_lr] -/* Trap handler which unmask Abort, keep IRQ/FIQ masked */ -#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a) + add r11, sp, #(UREGS_kernel_sizeof + 4) -/* Trap handler which unmask IRQ, keep Abort/FIQ masked */ -#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i) + str r11, [sp, #UREGS_sp] + + mrc CP32(r11, HSR) /* Save exception syndrome */ + str r11, [sp, #UREGS_hsr] + + mrs r11, SPSR_hyp + str r11, [sp, #UREGS_cpsr] + + /* + * We need to distinguish whether we came from guest or + * hypervisor context. + */ + and r0, r11, #PSR_MODE_MASK + cmp r0, #PSR_MODE_HYP + + bne 1f + /* + * Trap from the hypervisor + * + * Inherit the state of the interrupts from the hypervisor + * context. For that we need to use SPSR (stored in r11) and + * modify CPSR accordingly. + * + * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags) + */ + mrs r10, cpsr + bic r10, r10, #\hyp_iflags + and r11, r11, #\hyp_iflags + orr r10, r10, r11 + msr cpsr_cx, r10 + b 2f + +1: + /* Trap from the guest */ + bl save_guest_regs + .if \guest_iflags != n + cpsie \guest_iflags + .endif +2: + /* We are ready to handle the trap, setup the registers and jump. */ + adr lr, return_from_trap + mov r0, sp + /* + * Save the stack pointer in r11. It will be restored after the + * trap has been handled (see return_from_trap). 
+ */ + mov r11, sp + bic sp, #7 /* Align the stack pointer (noop on guest trap) */ + b do_trap_\trap + .endm .align 5 GLOBAL(hyp_traps_vector) @@ -223,14 +259,62 @@ #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ -DEFINE_TRAP_ENTRY(reset) -DEFINE_TRAP_ENTRY(undefined_instruction) -DEFINE_TRAP_ENTRY(hypervisor_call) -DEFINE_TRAP_ENTRY(prefetch_abort) -DEFINE_TRAP_ENTRY(guest_sync) -DEFINE_TRAP_ENTRY_NOIRQ(irq) -DEFINE_TRAP_ENTRY_NOIRQ(fiq) -DEFINE_TRAP_ENTRY_NOABORT(data_abort) +/* Vector not used by the Hypervisor. */ +trap_reset: + vector reset + +/* + * Vector only used by the Hypervisor. + * + * While the exception can be executed with all the interrupts (e.g. + * IRQ) unmasked, the interrupted context may have purposefully masked + * some of them. So we want to inherit the state from the interrupted + * context. + */ +trap_undefined_instruction: + vector undefined_instruction, hyp_iflags=IFLAGS_AIF + +/* We should never reach this trap */ +trap_hypervisor_call: + vector hypervisor_call + +/* + * Vector only used by the hypervisor. + * + * While the exception can be executed with all the interrupts (e.g. + * IRQ) unmasked, the interrupted context may have purposefully masked + * some of them. So we want to inherit the state from the interrupted + * context. + */ +trap_prefetch_abort: + vector prefetch_abort, hyp_iflags=IFLAGS_AIF + +/* + * Vector only used by the hypervisor. + * + * Data Abort should be rare and most likely fatal. It is best to not + * unmask any interrupts to limit the amount of code that can run before + * the Data Abort is treated. + */ +trap_data_abort: + vector data_abort + +/* Vector only used by the guest. We can unmask Abort/IRQ. */ +trap_guest_sync: + vector guest_sync, guest_iflags=ai + + +/* Vector used by the hypervisor and the guest. */ +trap_irq: + vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F + +/* + * Vector used by the hypervisor and the guest. + * + * FIQ are not meant to happen, so we don't unmask any interrupts. + */ +trap_fiq: + vector fiq return_from_trap: /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/Makefile xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -1,6 +1,5 @@ obj-y += memcpy.o memmove.o memset.o memchr.o memzero.o -obj-y += findbit.o setbit.o -obj-y += setbit.o clearbit.o changebit.o -obj-y += testsetbit.o testclearbit.o testchangebit.o +obj-y += findbit.o +obj-y += bitops.o obj-y += strchr.o strrchr.o obj-y += lib1funcs.o lshrdi3.o div64.o diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/bitops.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/bitops.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/bitops.c 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/bitops.c 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +/* + * The atomic bit operations pass the number of bit in a signed number + * (not sure why). This has the drawback to increase the complexity of + * the resulting assembly. + * + * To generate simpler code, the number of bit (nr) will be cast to + * unsigned int. + * + * XXX: Rework the interface to use unsigned int. + */ + +#define bitop(name, instr) \ +static always_inline bool int_##name(int nr, volatile void *p, bool timeout,\ + unsigned int max_try) \ +{ \ + volatile uint32_t *ptr = (uint32_t *)p + BIT_WORD((unsigned int)nr); \ + const uint32_t mask = BIT_MASK((unsigned int)nr); \ + unsigned long res, tmp; \ + \ + ASSERT(((vaddr_t)p & 0x3) == 0); \ + prefetchw((const void *)ptr); \ + \ + do \ + { \ + asm volatile ("// " __stringify(name) "\n" \ + " ldrex %2, %1\n" \ + " " __stringify(instr) " %2, %2, %3\n" \ + " strex %0, %2, %1\n" \ + : "=&r" (res), "+Qo" (*ptr), "=&r" (tmp) \ + : "r" (mask)); \ + \ + if ( !res ) \ + break; \ + } while ( !timeout || ((--max_try) > 0) ); \ + \ + return !res; \ +} \ + \ +void name(int nr, volatile void *p) \ +{ \ + if ( !int_##name(nr, p, false, 0) ) \ + ASSERT_UNREACHABLE(); \ +} \ + \ +bool name##_timeout(int nr, volatile void *p, unsigned int max_try) \ +{ \ + return int_##name(nr, p, true, max_try); \ +} + +#define testop(name, instr) \ +static always_inline bool int_##name(int nr, volatile void *p, int *oldbit, \ + bool timeout, unsigned int max_try) \ +{ \ + volatile uint32_t *ptr = (uint32_t *)p + BIT_WORD((unsigned int)nr); \ + unsigned int bit = (unsigned int)nr % BITS_PER_WORD; \ + const uint32_t mask = BIT_MASK(bit); \ + unsigned long res, tmp; \ + \ + ASSERT(((vaddr_t)p & 0x3) == 0); \ + smp_mb(); \ + \ + prefetchw((const void *)ptr); \ + \ + do \ + { \ + asm volatile ("// " __stringify(name) "\n" \ + " ldrex %3, %2\n" \ + " lsr %1, %3, %5 // Save old value of bit\n" \ + " " __stringify(instr) " %3, %3, %4 // Toggle bit\n" \ + " strex %0, %3, %2\n" \ + : "=&r" (res), "=&r" (*oldbit), "+Qo" (*ptr), "=&r" (tmp) \ + : "r" (mask), "r" (bit)); \ + \ + if ( !res ) \ + break; \ + } while ( !timeout || ((--max_try) > 0) ); \ + \ + smp_mb(); \ + \ + *oldbit &= 1; \ + \ + return !res; \ +} \ + \ +int name(int nr, volatile void *p) \ +{ \ + int oldbit; \ + \ + if ( !int_##name(nr, p, &oldbit, false, 0) ) \ + ASSERT_UNREACHABLE(); \ + \ + return oldbit; \ +} \ + \ +bool name##_timeout(int nr, volatile void *p, \ + int *oldbit, unsigned int max_try) \ +{ \ + return int_##name(nr, p, oldbit, true, max_try); \ +} + +bitop(change_bit, eor) +bitop(clear_bit, bic) +bitop(set_bit, orr) + +testop(test_and_change_bit, eor) +testop(test_and_clear_bit, bic) +testop(test_and_set_bit, orr) + +static always_inline bool int_clear_mask16(uint16_t mask, volatile uint16_t *p, + bool timeout, unsigned int max_try) +{ + unsigned long res, tmp; + + prefetchw((const uint16_t *)p); + + do + { + asm volatile ("// int_clear_mask16\n" + " ldrexh %2, %1\n" + " bic %2, %2, %3\n" + " strexh %0, %2, %1\n" + : "=&r" (res), "+Qo" (*p), "=&r" (tmp) + : "r" (mask)); + + if ( !res ) + break; + } while ( !timeout || ((--max_try) > 0) ); + + return !res; +} + +void clear_mask16(uint16_t mask, volatile void *p) +{ + if ( !int_clear_mask16(mask, p, false, 0) ) + ASSERT_UNREACHABLE(); +} + +bool clear_mask16_timeout(uint16_t mask, volatile void *p, + unsigned int max_try) +{ + return int_clear_mask16(mask, p, true, max_try); +} + +/* + 
* Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/bitops.h xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/bitops.h --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/bitops.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/bitops.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - -#if __LINUX_ARM_ARCH__ >= 6 - .macro bitop, name, instr -ENTRY( \name ) -UNWIND( .fnstart ) - ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned - mov r2, #1 - and r3, r0, #31 @ Get bit offset - mov r0, r0, lsr #5 - add r1, r1, r0, lsl #2 @ Get word offset -#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) - .arch_extension mp - ALT_SMP(W(pldw) [r1]) - ALT_UP(W(nop)) -#endif - mov r3, r2, lsl r3 -1: ldrex r2, [r1] - \instr r2, r2, r3 - strex r0, r2, [r1] - cmp r0, #0 - bne 1b - bx lr -UNWIND( .fnend ) -ENDPROC(\name ) - .endm - - .macro testop, name, instr, store -ENTRY( \name ) -UNWIND( .fnstart ) - ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned - mov r2, #1 - and r3, r0, #31 @ Get bit offset - mov r0, r0, lsr #5 - add r1, r1, r0, lsl #2 @ Get word offset - mov r3, r2, lsl r3 @ create mask - smp_dmb -#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) - .arch_extension mp - ALT_SMP(W(pldw) [r1]) - ALT_UP(W(nop)) -#endif -1: ldrex r2, [r1] - ands r0, r2, r3 @ save old value of bit - \instr r2, r2, r3 @ toggle bit - strex ip, r2, [r1] - cmp ip, #0 - bne 1b - smp_dmb - cmp r0, #0 - movne r0, #1 -2: bx lr -UNWIND( .fnend ) -ENDPROC(\name ) - .endm -#else - .macro bitop, name, instr -ENTRY( \name ) -UNWIND( .fnstart ) - ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned - and r2, r0, #31 - mov r0, r0, lsr #5 - mov r3, #1 - mov r3, r3, lsl r2 - save_and_disable_irqs ip - ldr r2, [r1, r0, lsl #2] - \instr r2, r2, r3 - str r2, [r1, r0, lsl #2] - restore_irqs ip - mov pc, lr -UNWIND( .fnend ) -ENDPROC(\name ) - .endm - -/** - * testop - implement a test_and_xxx_bit operation. - * @instr: operational instruction - * @store: store instruction - * - * Note: we can trivially conditionalise the store instruction - * to avoid dirtying the data cache. - */ - .macro testop, name, instr, store -ENTRY( \name ) -UNWIND( .fnstart ) - ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned - and r3, r0, #31 - mov r0, r0, lsr #5 - save_and_disable_irqs ip - ldr r2, [r1, r0, lsl #2]! - mov r0, #1 - tst r2, r0, lsl r3 - \instr r2, r2, r0, lsl r3 - \store r2, [r1] - moveq r0, #0 - restore_irqs ip - mov pc, lr -UNWIND( .fnend ) -ENDPROC(\name ) - .endm -#endif diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/changebit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/changebit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/changebit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/changebit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -/* - * linux/arch/arm/lib/changebit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#include "assembler.h" -#include "bitops.h" - .text - -bitop _change_bit, eor diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/clearbit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/clearbit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/clearbit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/clearbit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -/* - * linux/arch/arm/lib/clearbit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include "assembler.h" -#include "bitops.h" - .text - -bitop _clear_bit, bic diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/findbit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/findbit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/findbit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/findbit.S 2019-12-11 14:35:39.000000000 +0000 @@ -42,8 +42,8 @@ * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_zero_bit_le) - teq r1, #0 - beq 3b + cmp r1, r2 + bls 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) @@ -83,8 +83,8 @@ * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_bit_le) - teq r1, #0 - beq 3b + cmp r1, r2 + bls 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) @@ -117,8 +117,8 @@ ENDPROC(_find_first_zero_bit_be) ENTRY(_find_next_zero_bit_be) - teq r1, #0 - beq 3b + cmp r1, r2 + bls 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering @@ -151,8 +151,8 @@ ENDPROC(_find_first_bit_be) ENTRY(_find_next_bit_be) - teq r1, #0 - beq 3b + cmp r1, r2 + bls 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/setbit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/setbit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/setbit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/setbit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/* - * linux/arch/arm/lib/setbit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include "assembler.h" -#include "bitops.h" - .text - -bitop _set_bit, orr diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testchangebit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testchangebit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testchangebit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testchangebit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/* - * linux/arch/arm/lib/testchangebit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include "assembler.h" -#include "bitops.h" - .text - -testop _test_and_change_bit, eor, str diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testclearbit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testclearbit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testclearbit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testclearbit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/* - * linux/arch/arm/lib/testclearbit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include "assembler.h" -#include "bitops.h" - .text - -testop _test_and_clear_bit, bicne, strne diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testsetbit.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testsetbit.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm32/lib/testsetbit.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm32/lib/testsetbit.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/* - * linux/arch/arm/lib/testsetbit.S - * - * Copyright (C) 1995-1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include "assembler.h" -#include "bitops.h" - .text - -testop _test_and_set_bit, orreq, streq diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/entry.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/entry.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/entry.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/entry.S 2019-12-11 14:35:39.000000000 +0000 @@ -191,24 +191,63 @@ entry hyp=1 invalid BAD_ERROR +/* + * SError received while running in the hypervisor mode. + * + * Technically, we could unmask the IRQ if it were unmasked in the + * interrupted context. However, this require to check the PSTATE. For + * simplicity, as SError should be rare and potentially fatal, + * all interrupts are kept masked. + */ hyp_error: entry hyp=1 - msr daifclr, #2 mov x0, sp bl do_trap_hyp_serror exit hyp=1 -/* Traps taken in Current EL with SP_ELx */ +/* + * Synchronous exception received while running in the hypervisor mode. + * + * While the exception could be executed with all the interrupts (e.g. + * IRQ) unmasked, the interrupted context may have purposefully masked + * some of them. So we want to inherit the state from the interrupted + * context. + */ hyp_sync: entry hyp=1 - msr daifclr, #6 + + /* Inherit interrupts */ + mrs x0, SPSR_el2 + and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK) + msr daif, x0 + mov x0, sp bl do_trap_hyp_sync exit hyp=1 +/* + * IRQ received while running in the hypervisor mode. + * + * While the exception could be executed with all the interrupts but IRQ + * unmasked, the interrupted context may have purposefully masked some + * of them. So we want to inherit the state from the interrupt context + * and keep IRQ masked. + * + * XXX: We may want to consider an ordering between interrupts (e.g. if + * SError are masked, then IRQ should be masked too). However, this + * would require some rework in some paths (e.g. panic, livepatch) to + * ensure the ordering is enforced everywhere. 
+ */ hyp_irq: entry hyp=1 - msr daifclr, #4 + + /* Inherit D, A, F interrupts and keep I masked */ + mrs x0, SPSR_el2 + mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK) + and x0, x0, x1 + orr x0, x0, #PSR_IRQ_MASK + msr daif, x0 + mov x0, sp bl do_trap_irq exit hyp=1 diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/head.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/head.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/head.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/head.S 2019-12-11 14:35:39.000000000 +0000 @@ -32,6 +32,13 @@ #define PT_DEV 0xe71 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=0 P=1 */ #define PT_DEV_L3 0xe73 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=1 P=1 */ +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) + +#define __HEAD_FLAG_PHYS_BASE 1 + +#define __HEAD_FLAGS ((__HEAD_FLAG_PAGE_SIZE << 1) | \ + (__HEAD_FLAG_PHYS_BASE << 3)) + #if (defined (CONFIG_EARLY_PRINTK)) && (defined (EARLY_PRINTK_INC)) #include EARLY_PRINTK_INC #endif @@ -120,8 +127,8 @@ add x13, x18, #0x16 b real_start /* branch to kernel start */ .quad 0 /* Image load offset from start of RAM */ - .quad 0 /* reserved */ - .quad 0 /* reserved */ + .quad _end - start /* Effective size of kernel image, little-endian */ + .quad __HEAD_FLAGS /* Informative flags, little-endian */ .quad 0 /* reserved */ .quad 0 /* reserved */ .quad 0 /* reserved */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/lib/bitops.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/lib/bitops.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/lib/bitops.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/lib/bitops.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -/* - * Based on linux/arch/arm64/lib/bitops.h which in turn is - * Based on arch/arm/lib/bitops.h - * - * Copyright (C) 2013 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -/* - * x0: bits 4:0 bit offset - * bits 31:5 word offset - * x1: address - */ - .macro bitop, name, instr -ENTRY( \name ) - and w3, w0, #31 // Get bit offset - eor w0, w0, w3 // Clear low bits - mov x2, #1 - add x1, x1, x0, lsr #3 // Get word offset - lsl x3, x2, x3 // Create mask -1: ldxr w2, [x1] - \instr w2, w2, w3 - stxr w0, w2, [x1] - cbnz w0, 1b - ret -ENDPROC(\name ) - .endm - - .macro testop, name, instr -ENTRY( \name ) - and w3, w0, #31 // Get bit offset - eor w0, w0, w3 // Clear low bits - mov x2, #1 - add x1, x1, x0, lsr #3 // Get word offset - lsl x4, x2, x3 // Create mask -1: ldxr w2, [x1] - lsr w0, w2, w3 // Save old value of bit - \instr w2, w2, w4 // toggle bit - stlxr w5, w2, [x1] - cbnz w5, 1b - dmb ish - and w0, w0, #1 -3: ret -ENDPROC(\name ) - .endm - -/* - * Atomic bit operations. 
- */ - bitop change_bit, eor - bitop clear_bit, bic - bitop set_bit, orr - - testop test_and_change_bit, eor - testop test_and_clear_bit, bic - testop test_and_set_bit, orr diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/lib/bitops.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/lib/bitops.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/arm64/lib/bitops.c 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/arm64/lib/bitops.c 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +/* + * The atomic bit operations pass the number of bit in a signed number + * (not sure why). This has the drawback to increase the complexity of + * the resulting assembly. + * + * To generate simpler code, the number of bit (nr) will be cast to + * unsigned int. + * + * XXX: Rework the interface to use unsigned int. + */ + +#define bitop(name, instr) \ +static always_inline bool int_##name(int nr, volatile void *p, bool timeout,\ + unsigned int max_try) \ +{ \ + volatile uint32_t *ptr = (uint32_t *)p + BIT_WORD((unsigned int)nr); \ + const uint32_t mask = BIT_MASK((unsigned int)nr); \ + unsigned long res, tmp; \ + \ + do \ + { \ + asm volatile ("// " __stringify(name) "\n" \ + " ldxr %w2, %1\n" \ + " " __stringify(instr) " %w2, %w2, %w3\n" \ + " stxr %w0, %w2, %1\n" \ + : "=&r" (res), "+Q" (*ptr), "=&r" (tmp) \ + : "r" (mask)); \ + \ + if ( !res ) \ + break; \ + } while ( !timeout || ((--max_try) > 0) ); \ + \ + return !res; \ +} \ + \ +void name(int nr, volatile void *p) \ +{ \ + if ( !int_##name(nr, p, false, 0) ) \ + ASSERT_UNREACHABLE(); \ +} \ + \ +bool name##_timeout(int nr, volatile void *p, unsigned int max_try) \ +{ \ + return int_##name(nr, p, true, max_try); \ +} + +#define testop(name, instr) \ +static always_inline bool int_##name(int nr, volatile void *p, int *oldbit, \ + bool timeout, unsigned int max_try) \ +{ \ + volatile uint32_t *ptr = (uint32_t *)p + BIT_WORD((unsigned int)nr); \ + unsigned int bit = (unsigned int)nr % BITS_PER_WORD; \ + const uint32_t mask = BIT_MASK(bit); \ + unsigned long res, tmp; \ + \ + do \ + { \ + asm volatile ("// " __stringify(name) "\n" \ + " ldxr %w3, %2\n" \ + " lsr %w1, %w3, %w5 // Save old value of bit\n" \ + " " __stringify(instr) " %w3, %w3, %w4 // Toggle bit\n" \ + " stlxr %w0, %w3, %2\n" \ + : "=&r" (res), "=&r" (*oldbit), "+Q" (*ptr), "=&r" (tmp) \ + : "r" (mask), "r" (bit) \ + : "memory"); \ + \ + if ( !res ) \ + break; \ + } while ( !timeout || ((--max_try) > 0) ); \ + \ + dmb(ish); \ + \ + *oldbit &= 1; \ + \ + return !res; \ +} \ + \ +int name(int nr, volatile void *p) \ +{ \ + int oldbit; \ + \ + if ( !int_##name(nr, p, &oldbit, false, 0) ) \ + ASSERT_UNREACHABLE(); \ + \ + return oldbit; \ +} \ + \ +bool name##_timeout(int nr, volatile void *p, \ + int *oldbit, unsigned int max_try) \ +{ \ + return int_##name(nr, p, oldbit, true, max_try); \ +} + +bitop(change_bit, eor) +bitop(clear_bit, bic) 
+bitop(set_bit, orr) + +testop(test_and_change_bit, eor) +testop(test_and_clear_bit, bic) +testop(test_and_set_bit, orr) + +static always_inline bool int_clear_mask16(uint16_t mask, volatile uint16_t *p, + bool timeout, unsigned int max_try) +{ + unsigned long res, tmp; + + do + { + asm volatile ("// int_clear_mask16\n" + " ldxrh %w2, %1\n" + " bic %w2, %w2, %w3\n" + " stxrh %w0, %w2, %1\n" + : "=&r" (res), "+Q" (*p), "=&r" (tmp) + : "r" (mask)); + + if ( !res ) + break; + } while ( !timeout || ((--max_try) > 0) ); + + return !res; +} + +void clear_mask16(uint16_t mask, volatile void *p) +{ + if ( !int_clear_mask16(mask, p, false, 0) ) + ASSERT_UNREACHABLE(); +} + +bool clear_mask16_timeout(uint16_t mask, volatile void *p, + unsigned int max_try) +{ + return int_clear_mask16(mask, p, true, max_try); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/domain.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/domain.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/domain.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/domain.c 2019-12-11 14:35:39.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -273,28 +274,31 @@ static void update_runstate_area(struct vcpu *v) { void __user *guest_handle = NULL; + struct vcpu_runstate_info runstate; if ( guest_handle_is_null(runstate_guest(v)) ) return; + memcpy(&runstate, &v->runstate, sizeof(runstate)); + if ( VM_ASSIST(v->domain, runstate_update_flag) ) { guest_handle = &v->runstate_guest.p->state_entry_time + 1; guest_handle--; - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + (void *)(&runstate.state_entry_time + 1) - 1, 1); smp_wmb(); } - __copy_to_guest(runstate_guest(v), &v->runstate, 1); + __copy_to_guest(runstate_guest(v), &runstate, 1); if ( guest_handle ) { - v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; smp_wmb(); __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + (void *)(&runstate.state_entry_time + 1) - 1, 1); } } @@ -376,14 +380,15 @@ /* Nothing to do -- no lazy switching */ } -#define next_arg(fmt, args) ({ \ +#define NEXT_ARG(fmt, args) \ +({ \ unsigned long __arg; \ switch ( *(fmt)++ ) \ { \ case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ - default: __arg = 0; BUG(); \ + default: goto bad_fmt; \ } \ __arg; \ }) @@ -398,9 +403,6 @@ unsigned int i; va_list args; - /* All hypercalls take at least one argument */ - BUG_ON( !p || *p == '\0' ); - current->hcall_preempted = true; va_start(args, format); @@ -408,7 +410,7 @@ if ( mcs->flags & MCSF_in_multicall ) { for ( i = 0; *p != '\0'; i++ ) - mcs->call.args[i] = next_arg(p, args); + mcs->call.args[i] = NEXT_ARG(p, args); /* Return value gets written back to mcs->call.result */ rc = mcs->call.result; @@ -424,7 +426,7 @@ for ( i = 0; *p != '\0'; i++ ) { - arg = next_arg(p, args); + arg = NEXT_ARG(p, args); switch ( i ) { @@ -447,7 +449,7 @@ for ( i = 0; *p != '\0'; i++ ) { - arg = next_arg(p, args); + arg = NEXT_ARG(p, args); switch ( i ) { @@ -468,8 +470,17 @@ va_end(args); return rc; + + bad_fmt: + 
va_end(args); + gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); + ASSERT_UNREACHABLE(); + domain_crash(current->domain); + return 0; } +#undef NEXT_ARG + void startup_cpu_idle_loop(void) { struct vcpu *v = current; @@ -961,7 +972,7 @@ void vcpu_mark_events_pending(struct vcpu *v) { - int already_pending = test_and_set_bit( + bool already_pending = guest_test_and_set_bit(v->domain, 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); if ( already_pending ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/gic-v2.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/gic-v2.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/gic-v2.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/gic-v2.c 2019-12-11 14:35:39.000000000 +0000 @@ -353,6 +353,10 @@ type = readl_gicd(GICD_TYPER); nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1); + /* Only 1020 interrupts are supported */ + nr_lines = min(1020U, nr_lines); + gicv2_info.nr_lines = nr_lines; + gic_cpus = 1 + ((type & GICD_TYPE_CPUS) >> 5); printk("GICv2: %d lines, %d cpu%s%s (IID %8.8x).\n", nr_lines, gic_cpus, (gic_cpus == 1) ? "" : "s", @@ -375,10 +379,10 @@ /* Disable all global interrupts */ for ( i = 32; i < nr_lines; i += 32 ) + { writel_gicd(~0x0, GICD_ICENABLER + (i / 32) * 4); - - /* Only 1020 interrupts are supported */ - gicv2_info.nr_lines = min(1020U, nr_lines); + writel_gicd(~0x0, GICD_ICACTIVER + (i / 32) * 4); + } /* Turn on the distributor */ writel_gicd(GICD_CTL_ENABLE, GICD_CTLR); @@ -393,6 +397,7 @@ /* The first 32 interrupts (PPI and SGI) are banked per-cpu, so * even though they are controlled with GICD registers, they must * be set up here with the other per-cpu state. */ + writel_gicd(0xffffffff, GICD_ICACTIVER); /* Deactivate PPIs and SGIs */ writel_gicd(0xffff0000, GICD_ICENABLER); /* Disable all PPI */ writel_gicd(0x0000ffff, GICD_ISENABLER); /* Enable all SGI */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/gic-v3.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/gic-v3.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/gic-v3.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/gic-v3.c 2019-12-11 14:35:39.000000000 +0000 @@ -609,6 +609,10 @@ if ( type & GICD_TYPE_LPIS ) gicv3_lpi_init_host_lpis(GICD_TYPE_ID_BITS(type)); + /* Only 1020 interrupts are supported */ + nr_lines = min(1020U, nr_lines); + gicv3_info.nr_lines = nr_lines; + printk("GICv3: %d lines, (IID %8.8x).\n", nr_lines, readl_relaxed(GICD + GICD_IIDR)); @@ -624,9 +628,12 @@ writel_relaxed(priority, GICD + GICD_IPRIORITYR + (i / 4) * 4); } - /* Disable all global interrupts */ + /* Disable/deactivate all global interrupts */ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i += 32 ) + { writel_relaxed(0xffffffff, GICD + GICD_ICENABLER + (i / 32) * 4); + writel_relaxed(0xffffffff, GICD + GICD_ICACTIVER + (i / 32) * 4); + } /* * Configure SPIs as non-secure Group-1. This will only matter @@ -648,9 +655,6 @@ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i++ ) writeq_relaxed(affinity, GICD + GICD_IROUTER + i * 8); - - /* Only 1020 interrupts are supported */ - gicv3_info.nr_lines = min(1020U, nr_lines); } static int gicv3_enable_redist(void) @@ -836,6 +840,11 @@ GICD_RDIST_SGI_BASE + GICR_IPRIORITYR0 + (i / 4) * 4); /* + * The activate state is unknown at boot, so make sure all + * SGIs and PPIs are de-activated. + */ + writel_relaxed(0xffffffff, GICD_RDIST_SGI_BASE + GICR_ICACTIVER0); + /* * Disable all PPI interrupts, ensure all SGI interrupts are * enabled.
*/ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/guest_atomics.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/guest_atomics.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/guest_atomics.c 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/guest_atomics.c 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,91 @@ +/* + * arch/arm/guest_atomics.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; If not, see . + */ +#include + +#include + +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, guest_safe_atomic_max); + +/* + * Heuristic to find a safe upper-limit for load-store exclusive + * operations on memory shared with guest. + * + * At the moment, we calculate the number of iterations of a simple + * load-store atomic loop in 1uS. + */ +static void calibrate_safe_atomic(void) +{ + s_time_t deadline = NOW() + MICROSECS(1); + unsigned int counter = 0; + unsigned long mem = 0; + + do + { + unsigned long res, tmp; + +#ifdef CONFIG_ARM_32 + asm volatile (" ldrex %2, %1\n" + " add %2, %2, #1\n" + " strex %0, %2, %1\n" + : "=&r" (res), "+Q" (mem), "=&r" (tmp)); +#else + asm volatile (" ldxr %w2, %1\n" + " add %w2, %w2, #1\n" + " stxr %w0, %w2, %1\n" + : "=&r" (res), "+Q" (mem), "=&r" (tmp)); +#endif + counter++; + } while (NOW() < deadline); + + this_cpu(guest_safe_atomic_max) = counter; + + printk(XENLOG_DEBUG + "CPU%u: Guest atomics will try %u times before pausing the domain\n", + smp_processor_id(), counter); +} + +static int cpu_guest_safe_atomic_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + if ( action == CPU_STARTING ) + calibrate_safe_atomic(); + + return NOTIFY_DONE; +} + +static struct notifier_block cpu_guest_safe_atomic_nfb = { + .notifier_call = cpu_guest_safe_atomic_callback, +}; + +static int __init guest_safe_atomic_init(void) +{ + register_cpu_notifier(&cpu_guest_safe_atomic_nfb); + + calibrate_safe_atomic(); + + return 0; +} +presmp_initcall(guest_safe_atomic_init); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/guest_walk.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/guest_walk.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/guest_walk.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/guest_walk.c 2019-12-11 14:35:39.000000000 +0000 @@ -589,7 +589,7 @@ int guest_walk_tables(const struct vcpu *v, vaddr_t gva, paddr_t *ipa, unsigned int *perms) { - uint32_t sctlr = READ_SYSREG(SCTLR_EL1); + register_t sctlr = READ_SYSREG(SCTLR_EL1); register_t tcr = READ_SYSREG(TCR_EL1); unsigned int _perms; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/irq.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/irq.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/irq.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/irq.c 2019-12-11 14:35:39.000000000 +0000 @@ -44,7 +44,14 @@ printk("unexpected IRQ trap at irq %02x\n", irq->irq); } -static void end_none(struct irq_desc *irq) { } +static void 
end_none(struct irq_desc *irq) +{ + /* + * Still allow a CPU to end an interrupt if we receive a spurious + * interrupt. This will prevent the CPU to lose interrupt forever. + */ + gic_hw_ops->gic_host_irq_type->end(irq); +} hw_irq_controller no_irq_type = { .typename = "none", diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/livepatch.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/livepatch.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/livepatch.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/livepatch.c 2019-12-11 14:35:39.000000000 +0000 @@ -18,6 +18,11 @@ void *vmap_of_xen_text; +int arch_livepatch_safety_check(void) +{ + return 0; +} + int arch_livepatch_quiesce(void) { mfn_t text_mfn; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/mm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/mm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/mm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/mm.c 2019-12-11 14:35:39.000000000 +0000 @@ -40,6 +40,8 @@ #include #include #include + +#include #include struct domain *dom_xen, *dom_io, *dom_cow; @@ -149,6 +151,7 @@ vaddr_t xenheap_virt_end __read_mostly; #ifdef CONFIG_ARM_64 vaddr_t xenheap_virt_start __read_mostly; +unsigned long xenheap_base_pdx __read_mostly; #endif unsigned long frametable_base_pdx __read_mostly; @@ -820,6 +823,7 @@ if ( mfn_eq(xenheap_mfn_start, INVALID_MFN) ) { xenheap_mfn_start = _mfn(base_mfn); + xenheap_base_pdx = mfn_to_pdx(_mfn(base_mfn)); xenheap_virt_start = DIRECTMAP_VIRT_START + (base_mfn - mfn) * PAGE_SIZE; } @@ -885,8 +889,8 @@ /* Map a frame table to cover physical addresses ps through pe */ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe) { - unsigned long nr_pages = (pe - ps) >> PAGE_SHIFT; - unsigned long nr_pdxs = pfn_to_pdx(nr_pages); + unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(pe), -1)) - + mfn_to_pdx(maddr_to_mfn(ps)) + 1; unsigned long frametable_size = nr_pdxs * sizeof(struct page_info); mfn_t base_mfn; const unsigned long mapping_size = frametable_size < MB(32) ? MB(2) : MB(32); @@ -983,7 +987,7 @@ unsigned long nr_mfns, unsigned int flags) { - int rc; + int rc = 0; unsigned long addr = virt, addr_end = addr + nr_mfns * PAGE_SIZE; lpae_t pte, *entry; lpae_t *third = NULL; @@ -1012,7 +1016,8 @@ { printk("%s: trying to replace an existing mapping addr=%lx mfn=%"PRI_mfn"\n", __func__, addr, mfn_x(mfn)); - return -EINVAL; + rc = -EINVAL; + goto out; } if ( op == RESERVE ) break; @@ -1029,7 +1034,8 @@ { printk("%s: trying to %s a non-existing mapping addr=%lx\n", __func__, op == REMOVE ? "remove" : "modify", addr); - return -EINVAL; + rc = -EINVAL; + goto out; } if ( op == REMOVE ) pte.bits = 0; @@ -1042,7 +1048,8 @@ { printk("%s: Incorrect combination for addr=%lx\n", __func__, addr); - return -EINVAL; + rc = -EINVAL; + goto out; } } write_pte(entry, pte); @@ -1051,11 +1058,14 @@ BUG(); } } +out: + /* + * Flush the TLBs even in case of failure because we may have + * partially modified the PT. This will prevent any unexpected + * behavior afterwards. + */ flush_xen_data_tlb_range_va(virt, PAGE_SIZE * nr_mfns); - rc = 0; - -out: return rc; } @@ -1390,17 +1400,9 @@ return; } -void gnttab_clear_flag(unsigned long nr, uint16_t *addr) +void gnttab_clear_flag(struct domain *d, unsigned long nr, uint16_t *addr) { - /* - * Note that this cannot be clear_bit(), as the access must be - * confined to the specified 2 bytes. 
- */ - uint16_t mask = ~(1 << nr), old; - - do { - old = *addr; - } while (cmpxchg(addr, old, old & mask) != old); + guest_clear_mask16(d, BIT(nr), addr); } void gnttab_mark_dirty(struct domain *d, mfn_t mfn) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/p2m.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/p2m.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/p2m.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/p2m.c 2019-12-11 14:35:39.000000000 +0000 @@ -177,21 +177,14 @@ static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn) { - unsigned int root_table; - - if ( P2M_ROOT_PAGES == 1 ) - return __map_domain_page(p2m->root); + unsigned long root_table; /* - * Concatenated root-level tables. The table number will be the - * offset at the previous level. It is not possible to - * concatenate a level-0 root. + * While the root table index is the offset from the previous level, + * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be + * 0. Yet we still want to check if all the unused bits are zeroed. */ - ASSERT(P2M_ROOT_LEVEL > 0); - - root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]); - root_table &= LPAE_ENTRY_MASK; - + root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT); if ( root_table >= P2M_ROOT_PAGES ) return NULL; @@ -325,7 +318,12 @@ * the table should always be non-NULL because the gfn is below * p2m->max_mapped_gfn and the root table pages are always present. */ - BUG_ON(table == NULL); + if ( !table ) + { + ASSERT_UNREACHABLE(); + level = P2M_ROOT_LEVEL; + goto out; + } for ( level = P2M_ROOT_LEVEL; level < 3; level++ ) { @@ -938,10 +936,16 @@ p2m_write_pte(entry, pte, p2m->clean_pte); p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn, - gfn_add(sgfn, 1 << page_order)); + gfn_add(sgfn, (1UL << page_order) - 1)); p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn); } + if ( need_iommu(p2m->domain) && + (lpae_valid(orig_pte) || lpae_valid(*entry)) ) + rc = iommu_iotlb_flush(p2m->domain, gfn_x(sgfn), 1UL << page_order); + else + rc = 0; + /* * Free the entry only if the original pte was valid and the base * is different (to avoid freeing when permission is changed). @@ -949,12 +953,6 @@ if ( lpae_valid(orig_pte) && entry->p2m.base != orig_pte.p2m.base ) p2m_free_entry(p2m, orig_pte, level); - if ( need_iommu(p2m->domain) && - (lpae_valid(orig_pte) || lpae_valid(*entry)) ) - rc = iommu_iotlb_flush(p2m->domain, gfn_x(sgfn), 1UL << page_order); - else - rc = 0; - out: unmap_domain_page(table); @@ -1298,7 +1296,7 @@ p2m_write_lock(p2m); start = p2m->lowest_mapped_gfn; - end = p2m->max_mapped_gfn; + end = gfn_add(p2m->max_mapped_gfn, 1); for ( ; gfn_x(start) < gfn_x(end); start = gfn_next_boundary(start, order) ) @@ -1363,7 +1361,7 @@ p2m_read_lock(p2m); start = gfn_max(start, p2m->lowest_mapped_gfn); - end = gfn_min(end, p2m->max_mapped_gfn); + end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1)); for ( ; gfn_x(start) < gfn_x(end); start = next_gfn ) { diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/setup.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/setup.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/setup.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/setup.c 2019-12-11 14:35:39.000000000 +0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -725,7 +726,7 @@ /* Register Xen's load address as a boot module. 
*/ xen_bootmodule = add_boot_module(BOOTMOD_XEN, (paddr_t)(uintptr_t)(_start + boot_phys_offset), - (paddr_t)(uintptr_t)(_end - _start + 1), NULL); + (paddr_t)(uintptr_t)(_end - _start), NULL); BUG_ON(!xen_bootmodule); xen_paddr = get_xen_paddr(); @@ -787,8 +788,11 @@ tasklet_subsys_init(); - - xsm_dt_init(); + if ( xsm_dt_init() != 1 ) + warning_add("WARNING: SILO mode is not enabled.\n" + "It has implications on the security of the system,\n" + "unless the communications have been forbidden between\n" + "untrusted domains.\n"); init_maintenance_interrupt(); init_timer_interrupt(); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/time.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/time.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/time.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/time.c 2019-12-11 14:35:39.000000000 +0000 @@ -149,7 +149,7 @@ if ( res ) panic("Timer: Cannot initialize platform timer"); - boot_count = READ_SYSREG64(CNTPCT_EL0); + boot_count = get_cycles(); } static void __init init_dt_xen_time(void) @@ -190,7 +190,7 @@ /* Return number of nanoseconds since boot */ s_time_t get_s_time(void) { - uint64_t ticks = READ_SYSREG64(CNTPCT_EL0) - boot_count; + uint64_t ticks = get_cycles() - boot_count; return ticks_to_ns(ticks); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/traps.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/traps.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/traps.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/traps.c 2019-12-11 14:35:39.000000000 +0000 @@ -404,7 +404,7 @@ static void cpsr_switch_mode(struct cpu_user_regs *regs, int mode) { - uint32_t sctlr = READ_SYSREG32(SCTLR_EL1); + register_t sctlr = READ_SYSREG(SCTLR_EL1); regs->cpsr &= ~(PSR_MODE_MASK|PSR_IT_MASK|PSR_JAZELLE|PSR_BIG_ENDIAN|PSR_THUMB); @@ -420,7 +420,7 @@ static vaddr_t exception_handler32(vaddr_t offset) { - uint32_t sctlr = READ_SYSREG32(SCTLR_EL1); + register_t sctlr = READ_SYSREG(SCTLR_EL1); if (sctlr & SCTLR_V) return 0xffff0000 + offset; @@ -740,7 +740,7 @@ struct reg_ctxt { /* Guest-side state */ - uint32_t sctlr_el1; + register_t sctlr_el1; register_t tcr_el1; uint64_t ttbr0_el1, ttbr1_el1; #ifdef CONFIG_ARM_32 @@ -843,7 +843,7 @@ if ( guest_mode ) { - printk(" SCTLR: %08"PRIx32"\n", ctxt->sctlr_el1); + printk(" SCTLR: %"PRIregister"\n", ctxt->sctlr_el1); printk(" TCR: %08"PRIregister"\n", ctxt->tcr_el1); printk(" TTBR0: %016"PRIx64"\n", ctxt->ttbr0_el1); printk(" TTBR1: %016"PRIx64"\n", ctxt->ttbr1_el1); @@ -915,7 +915,7 @@ printk(" ESR_EL1: %08"PRIx32"\n", ctxt->esr_el1); printk(" FAR_EL1: %016"PRIx64"\n", ctxt->far); printk("\n"); - printk(" SCTLR_EL1: %08"PRIx32"\n", ctxt->sctlr_el1); + printk(" SCTLR_EL1: %"PRIregister"\n", ctxt->sctlr_el1); printk(" TCR_EL1: %08"PRIregister"\n", ctxt->tcr_el1); printk(" TTBR0_EL1: %016"PRIx64"\n", ctxt->ttbr0_el1); printk(" TTBR1_EL1: %016"PRIx64"\n", ctxt->ttbr1_el1); @@ -935,21 +935,11 @@ if ( guest_mode ) { - if ( is_32bit_domain(v->domain) ) + if ( psr_mode_is_32bit(regs->cpsr) ) show_registers_32(regs, ctxt, guest_mode, v); #ifdef CONFIG_ARM_64 - else if ( is_64bit_domain(v->domain) ) - { - if ( psr_mode_is_32bit(regs->cpsr) ) - { - BUG_ON(!usr_mode(regs)); - show_registers_32(regs, ctxt, guest_mode, v); - } - else - { - show_registers_64(regs, ctxt, guest_mode, v); - } - } + else + show_registers_64(regs, ctxt, guest_mode, v); #endif } else @@ -1664,12 +1654,9 @@ void advance_pc(struct cpu_user_regs *regs, const union hsr hsr) { unsigned long itbits, cond, cpsr = regs->cpsr; + 
bool is_thumb = psr_mode_is_32bit(cpsr) && (cpsr & PSR_THUMB); - /* PSR_IT_MASK bits can only be set for 32-bit processors in Thumb mode. */ - BUG_ON( (!psr_mode_is_32bit(cpsr)||!(cpsr&PSR_THUMB)) - && (cpsr&PSR_IT_MASK) ); - - if ( cpsr&PSR_IT_MASK ) + if ( is_thumb && (cpsr & PSR_IT_MASK) ) { /* The ITSTATE[7:0] block is contained in CPSR[15:10],CPSR[26:25] * diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vgic/vgic.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vgic/vgic.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vgic/vgic.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vgic/vgic.c 2019-12-11 14:35:39.000000000 +0000 @@ -692,11 +692,6 @@ } } -void arch_evtchn_inject(struct vcpu *v) -{ - vgic_inject_irq(v->domain, v, v->domain->arch.evtchn_irq, true); -} - bool vgic_evtchn_irq_pending(struct vcpu *v) { struct vgic_irq *irq; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vgic.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vgic.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vgic.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vgic.c 2019-12-11 14:35:39.000000000 +0000 @@ -597,11 +597,6 @@ return; } -void arch_evtchn_inject(struct vcpu *v) -{ - vgic_inject_irq(v->domain, v, v->domain->arch.evtchn_irq, true); -} - bool vgic_evtchn_irq_pending(struct vcpu *v) { struct pending_irq *p; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vsmc.c xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vsmc.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/vsmc.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/vsmc.c 2019-12-11 14:35:39.000000000 +0000 @@ -181,7 +181,7 @@ { bool handled = false; const union hsr hsr = { .bits = regs->hsr }; - register_t funcid = get_user_reg(regs, 0); + uint32_t funcid = get_user_reg(regs, 0); /* * Check immediate value for HVC32, HVC64 and SMC64. @@ -240,7 +240,7 @@ if ( !handled ) { - gprintk(XENLOG_INFO, "Unhandled SMC/HVC: %08"PRIregister"\n", funcid); + gprintk(XENLOG_INFO, "Unhandled SMC/HVC: %#x\n", funcid); /* Inform caller that function is not supported. */ set_user_reg(regs, 0, ARM_SMCCC_ERR_UNKNOWN_FUNCTION); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/arm/xen.lds.S xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/xen.lds.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/arm/xen.lds.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/arm/xen.lds.S 2019-12-11 14:35:39.000000000 +0000 @@ -195,14 +195,16 @@ *(.bss.stack_aligned) . = ALIGN(PAGE_SIZE); *(.bss.page_aligned) - *(.bss) - . = ALIGN(SMP_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; + *(.bss.percpu.page_aligned) *(.bss.percpu) . = ALIGN(SMP_CACHE_BYTES); *(.bss.percpu.read_mostly) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_data_end = .; + *(.bss) + . = ALIGN(POINTER_ALIGN); __bss_end = .; } :text _end = . 
; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/Makefile xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -65,6 +65,7 @@ obj-y += time.o obj-y += trace.o obj-y += traps.o +obj-y += tsx.o obj-y += usercopy.o obj-y += x86_emulate.o obj-$(CONFIG_TBOOT) += tboot.o @@ -110,20 +111,20 @@ $(LD_LTO) -r -o $@ $^ prelink-efi_lto.o: $(ALL_OBJS) efi/runtime.o efi/compat.o - $(guard) $(LD_LTO) -r -o $@ $(filter-out %/efi/built_in.o,$^) + $(LD_LTO) -r -o $@ $(filter-out %/efi/built_in.o,$^) # Link it with all the binary objects prelink.o: $(patsubst %/built_in.o,%/built_in_bin.o,$(ALL_OBJS)) prelink_lto.o $(LD) $(LDFLAGS) -r -o $@ $^ prelink-efi.o: $(patsubst %/built_in.o,%/built_in_bin.o,$(ALL_OBJS)) prelink-efi_lto.o efi/boot.init.o - $(guard) $(LD) $(LDFLAGS) -r -o $@ $^ + $(LD) $(LDFLAGS) -r -o $@ $^ else prelink.o: $(ALL_OBJS) $(LD) $(LDFLAGS) -r -o $@ $^ prelink-efi.o: $(ALL_OBJS) efi/boot.init.o efi/runtime.o efi/compat.o - $(guard) $(LD) $(LDFLAGS) -r -o $@ $(filter-out %/efi/built_in.o,$^) + $(LD) $(LDFLAGS) -r -o $@ $(filter-out %/efi/built_in.o,$^) endif $(BASEDIR)/common/symbols-dummy.o: @@ -171,8 +172,6 @@ $(TARGET).efi: VIRT_BASE = 0x$(shell $(NM) efi/relocs-dummy.o | sed -n 's, A VIRT_START$$,,p') $(TARGET).efi: ALT_BASE = 0x$(shell $(NM) efi/relocs-dummy.o | sed -n 's, A ALT_START$$,,p') -# Don't use $(wildcard ...) here - at least make 3.80 expands this too early! -$(TARGET).efi: guard = $(if $(filter y,$(XEN_BUILD_PE)),,:) ifneq ($(build_id_linker),) ifeq ($(call ld-ver-build-id,$(LD) $(filter -m%,$(EFI_LDFLAGS))),y) @@ -190,30 +189,34 @@ endif note_file_option ?= $(note_file) +ifeq ($(filter y,$(XEN_BUILD_PE)),y) $(TARGET).efi: prelink-efi.o $(note_file) efi.lds efi/relocs-dummy.o $(BASEDIR)/common/symbols-dummy.o efi/mkreloc $(foreach base, $(VIRT_BASE) $(ALT_BASE), \ - $(guard) $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< efi/relocs-dummy.o \ + $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< efi/relocs-dummy.o \ $(BASEDIR)/common/symbols-dummy.o $(note_file_option) -o $(@D)/.$(@F).$(base).0 &&) : - $(guard) efi/mkreloc $(foreach base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).0) >$(@D)/.$(@F).0r.S - $(guard) $(NM) -pa --format=sysv $(@D)/.$(@F).$(VIRT_BASE).0 \ - | $(guard) $(BASEDIR)/tools/symbols $(all_symbols) --sysv --sort >$(@D)/.$(@F).0s.S - $(guard) $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).0r.o $(@D)/.$(@F).0s.o + efi/mkreloc $(foreach base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).0) >$(@D)/.$(@F).0r.S + $(NM) -pa --format=sysv $(@D)/.$(@F).$(VIRT_BASE).0 \ + | $(BASEDIR)/tools/symbols $(all_symbols) --sysv --sort >$(@D)/.$(@F).0s.S + $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).0r.o $(@D)/.$(@F).0s.o $(foreach base, $(VIRT_BASE) $(ALT_BASE), \ - $(guard) $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< \ + $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< \ $(@D)/.$(@F).0r.o $(@D)/.$(@F).0s.o $(note_file_option) -o $(@D)/.$(@F).$(base).1 &&) : - $(guard) efi/mkreloc $(foreach base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).1) >$(@D)/.$(@F).1r.S - $(guard) $(NM) -pa --format=sysv $(@D)/.$(@F).$(VIRT_BASE).1 \ - | $(guard) $(BASEDIR)/tools/symbols $(all_symbols) --sysv --sort >$(@D)/.$(@F).1s.S - $(guard) $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).1r.o $(@D)/.$(@F).1s.o - $(guard) $(LD) $(call EFI_LDFLAGS,$(VIRT_BASE)) -T efi.lds -N $< \ + efi/mkreloc $(foreach 
base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).1) >$(@D)/.$(@F).1r.S + $(NM) -pa --format=sysv $(@D)/.$(@F).$(VIRT_BASE).1 \ + | $(BASEDIR)/tools/symbols $(all_symbols) --sysv --sort >$(@D)/.$(@F).1s.S + $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).1r.o $(@D)/.$(@F).1s.o + $(LD) $(call EFI_LDFLAGS,$(VIRT_BASE)) -T efi.lds -N $< \ $(@D)/.$(@F).1r.o $(@D)/.$(@F).1s.o $(note_file_option) -o $@ - if $(guard) false; then rm -f $@; echo 'EFI support disabled'; \ - else $(NM) -pa --format=sysv $(@D)/$(@F) \ - | $(BASEDIR)/tools/symbols --xensyms --sysv --sort >$(@D)/$(@F).map; fi + $(NM) -pa --format=sysv $(@D)/$(@F) \ + | $(BASEDIR)/tools/symbols --xensyms --sysv --sort >$(@D)/$(@F).map rm -f $(@D)/.$(@F).[0-9]* $(@D)/..$(@F).[0-9]* +else +$(TARGET).efi: FORCE + rm -f $@; echo 'EFI support disabled' +endif -efi/boot.init.o efi/runtime.o efi/compat.o efi/buildid.o: $(BASEDIR)/arch/x86/efi/built_in.o -efi/boot.init.o efi/runtime.o efi/compat.o efi/buildid.o: ; +efi/boot.init.o efi/runtime.o efi/compat.o efi/buildid.o efi/relocs-dummy.o: $(BASEDIR)/arch/x86/efi/built_in.o +efi/boot.init.o efi/runtime.o efi/compat.o efi/buildid.o efi/relocs-dummy.o: ; asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c $(CC) $(filter-out -Wa$(comma)% -flto,$(CFLAGS)) -S -o $@ $< diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/acpi/boot.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/acpi/boot.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/acpi/boot.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/acpi/boot.c 2019-12-11 14:35:39.000000000 +0000 @@ -83,21 +83,26 @@ { struct acpi_madt_local_x2apic *processor = container_of(header, struct acpi_madt_local_x2apic, header); - bool enabled = false; + bool enabled = false, log = false; if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + if ((processor->lapic_flags & ACPI_MADT_ENABLED) || + processor->local_apic_id != 0xffffffff || opt_cpu_info) { + acpi_table_print_madt_entry(header); + log = true; + } /* Record local apic id only when enabled and fitting. */ if (processor->local_apic_id >= MAX_APICS || processor->uid >= MAX_MADT_ENTRIES) { - printk("%sAPIC ID %#x and/or ACPI ID %#x beyond limit" - " - processor ignored\n", - processor->lapic_flags & ACPI_MADT_ENABLED ? - KERN_WARNING "WARNING: " : KERN_INFO, - processor->local_apic_id, processor->uid); + if (log) + printk("%sAPIC ID %#x and/or ACPI ID %#x beyond limit" + " - processor ignored\n", + processor->lapic_flags & ACPI_MADT_ENABLED + ? KERN_WARNING "WARNING: " : KERN_INFO, + processor->local_apic_id, processor->uid); /* * Must not return an error here, to prevent * acpi_table_parse_entries() from terminating early. 
@@ -132,7 +137,9 @@ if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + if ((processor->lapic_flags & ACPI_MADT_ENABLED) || + processor->id != 0xff || opt_cpu_info) + acpi_table_print_madt_entry(header); /* Record local apic id only when enabled */ if (processor->lapic_flags & ACPI_MADT_ENABLED) { diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/apic.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/apic.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/apic.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/apic.c 2019-12-11 14:35:39.000000000 +0000 @@ -944,15 +944,15 @@ force_iommu = 1; - genapic = apic_x2apic_probe(); - printk("Switched to APIC driver %s.\n", genapic->name); - if ( !x2apic_enabled ) { x2apic_enabled = true; __enable_x2apic(); } + genapic = apic_x2apic_probe(); + printk("Switched to APIC driver %s\n", genapic->name); + restore_out: restore_IO_APIC_setup(ioapic_entries); unmask_8259A(); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/Makefile xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -4,7 +4,10 @@ CMDLINE_DEPS = $(DEFS_H_DEPS) video.h -RELOC_DEPS = $(DEFS_H_DEPS) $(BASEDIR)/include/xen/multiboot.h \ +RELOC_DEPS = $(DEFS_H_DEPS) \ + $(BASEDIR)/include/generated/autoconf.h \ + $(BASEDIR)/include/xen/kconfig.h \ + $(BASEDIR)/include/xen/multiboot.h \ $(BASEDIR)/include/xen/multiboot2.h \ $(BASEDIR)/include/public/arch-x86/hvm/start_info.h diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/wakeup.S xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/wakeup.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/wakeup.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/wakeup.S 2019-12-11 14:35:39.000000000 +0000 @@ -137,6 +137,21 @@ add bootsym_rel(trampoline_xen_phys_start,4,%eax) mov %eax,%cr3 + /* Reapply IA32_MISC_ENABLE modifications from early_init_intel(). */ + mov bootsym_rel(trampoline_misc_enable_off, 4, %esi) + mov bootsym_rel(trampoline_misc_enable_off + 4, 4, %edi) + mov %esi, %eax + or %edi, %eax + jz 1f + mov $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + not %esi + not %edi + and %esi, %eax + and %edi, %edx + wrmsr +1: + /* Will cpuid feature change after resume? */ /* Set up EFER (Extended Feature Enable Register). 
*/ mov bootsym_rel(cpuid_ext_features,4,%edi) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/x86_64.S xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/x86_64.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/boot/x86_64.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/boot/x86_64.S 2019-12-11 14:35:39.000000000 +0000 @@ -55,13 +55,13 @@ .align PAGE_SIZE, 0 GLOBAL(boot_cpu_gdt_table) .quad 0x0000000000000000 /* unused */ - .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ - .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ + .quad 0x00af9b000000ffff /* 0xe008 ring 0 code, 64-bit mode */ + .quad 0x00cf93000000ffff /* 0xe010 ring 0 data */ .quad 0x0000000000000000 /* reserved */ - .quad 0x00cffa000000ffff /* 0xe023 ring 3 code, compatibility */ - .quad 0x00cff2000000ffff /* 0xe02b ring 3 data */ - .quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */ - .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ + .quad 0x00cffb000000ffff /* 0xe023 ring 3 code, compatibility */ + .quad 0x00cff3000000ffff /* 0xe02b ring 3 data */ + .quad 0x00affb000000ffff /* 0xe033 ring 3 code, 64-bit mode */ + .quad 0x00cf9b000000ffff /* 0xe038 ring 0 code, compatibility */ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ @@ -70,13 +70,13 @@ /* (compatibility) machine->physical mapping table lives there. */ GLOBAL(boot_cpu_compat_gdt_table) .quad 0x0000000000000000 /* unused */ - .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ - .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ - .quad 0x00cfba000000ffff /* 0xe019 ring 1 code, compatibility */ - .quad 0x00cfb2000000ffff /* 0xe021 ring 1 data */ - .quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */ - .quad 0x00cff2000000ffff /* 0xe033 ring 3 data */ - .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ + .quad 0x00af9b000000ffff /* 0xe008 ring 0 code, 64-bit mode */ + .quad 0x00cf93000000ffff /* 0xe010 ring 0 data */ + .quad 0x00cfbb000000ffff /* 0xe019 ring 1 code, compatibility */ + .quad 0x00cfb3000000ffff /* 0xe021 ring 1 data */ + .quad 0x00cffb000000ffff /* 0xe02b ring 3 code, compatibility */ + .quad 0x00cff3000000ffff /* 0xe033 ring 3 data */ + .quad 0x00cf9b000000ffff /* 0xe038 ring 0 code, compatibility */ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ .align PAGE_SIZE, 0 diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/amd.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/amd.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/amd.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/amd.c 2019-12-11 14:35:39.000000000 +0000 @@ -626,7 +626,7 @@ switch(c->x86) { - case 0xf ... 0x17: + case 0xf ... 
0x11: disable_c1e(NULL); if (acpi_smi_cmd && (acpi_enable_value | acpi_disable_value)) pv_post_outb_hook = check_disable_c1e; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/common.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/common.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/common.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/common.c 2019-12-11 14:35:39.000000000 +0000 @@ -472,7 +472,7 @@ this_cpu->c_init(c); - if ( !opt_pku ) + if (c == &boot_cpu_data && !opt_pku) setup_clear_cpu_cap(X86_FEATURE_PKU); /* @@ -734,7 +734,7 @@ unsigned long stack_bottom = get_stack_bottom(), stack_top = stack_bottom & ~(STACK_SIZE - 1); - struct tss_struct *tss = &this_cpu(init_tss); + struct tss64 *tss = &this_cpu(tss_page).tss; struct desc_struct *gdt = this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY; struct desc_struct *compat_gdt = @@ -749,7 +749,7 @@ .limit = (IDT_ENTRIES * sizeof(idt_entry_t)) - 1, }; - *tss = (struct tss_struct){ + *tss = (struct tss64){ /* Main stack for interrupts/exceptions. */ .rsp0 = stack_bottom, @@ -774,16 +774,12 @@ .bitmap = IOBMP_INVALID_OFFSET, }; - _set_tssldt_desc( - gdt + TSS_ENTRY, - (unsigned long)tss, - offsetof(struct tss_struct, __cacheline_filler) - 1, - SYS_DESC_tss_avail); - _set_tssldt_desc( - compat_gdt + TSS_ENTRY, - (unsigned long)tss, - offsetof(struct tss_struct, __cacheline_filler) - 1, - SYS_DESC_tss_busy); + BUILD_BUG_ON(sizeof(*tss) <= 0x67); /* Mandated by the architecture. */ + + _set_tssldt_desc(gdt + TSS_ENTRY, (unsigned long)tss, + sizeof(*tss) - 1, SYS_DESC_tss_avail); + _set_tssldt_desc(compat_gdt + TSS_ENTRY, (unsigned long)tss, + sizeof(*tss) - 1, SYS_DESC_tss_busy); lgdt(&gdtr); lidt(&idtr); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/vpmu.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/vpmu.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpu/vpmu.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpu/vpmu.c 2019-12-11 14:35:39.000000000 +0000 @@ -583,11 +583,36 @@ vpmu->arch_vpmu_ops->arch_vpmu_destroy(v); } + + vpmu_reset(vpmu, VPMU_CONTEXT_ALLOCATED); } -void vpmu_destroy(struct vcpu *v) +static void vpmu_cleanup(struct vcpu *v) { + struct vpmu_struct *vpmu = vcpu_vpmu(v); + void *xenpmu_data; + + spin_lock(&vpmu->vpmu_lock); + vpmu_arch_destroy(v); + xenpmu_data = vpmu->xenpmu_data; + vpmu->xenpmu_data = NULL; + + spin_unlock(&vpmu->vpmu_lock); + + if ( xenpmu_data ) + { + mfn_t mfn = domain_page_map_to_mfn(xenpmu_data); + + ASSERT(mfn_valid(mfn)); + unmap_domain_page_global(xenpmu_data); + put_page_and_type(mfn_to_page(mfn)); + } +} + +void vpmu_destroy(struct vcpu *v) +{ + vpmu_cleanup(v); put_vpmu(v); } @@ -646,9 +671,6 @@ static void pvpmu_finish(struct domain *d, xen_pmu_params_t *params) { struct vcpu *v; - struct vpmu_struct *vpmu; - mfn_t mfn; - void *xenpmu_data; if ( (params->vcpu >= d->max_vcpus) || (d->vcpu[params->vcpu] == NULL) ) return; @@ -657,22 +679,7 @@ if ( v != current ) vcpu_pause(v); - vpmu = vcpu_vpmu(v); - spin_lock(&vpmu->vpmu_lock); - - vpmu_arch_destroy(v); - xenpmu_data = vpmu->xenpmu_data; - vpmu->xenpmu_data = NULL; - - spin_unlock(&vpmu->vpmu_lock); - - if ( xenpmu_data ) - { - mfn = domain_page_map_to_mfn(xenpmu_data); - ASSERT(mfn_valid(mfn)); - unmap_domain_page_global(xenpmu_data); - put_page_and_type(mfn_to_page(mfn)); - } + vpmu_cleanup(v); if ( v != current ) vcpu_unpause(v); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpuid.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpuid.c --- 
xen-4.11.1+92-g6c33308a8d/xen/arch/x86/cpuid.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/cpuid.c 2019-12-11 14:35:39.000000000 +0000 @@ -622,6 +622,20 @@ if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) ) __set_bit(X86_FEATURE_ITSC, max_fs); + /* + * On hardware with MSR_TSX_CTRL, the admin may have elected to disable + * TSX and hide the feature bits. Migrating-in VMs may have been booted + * pre-mitigation when the TSX features were visible. + * + * This situation is compatible (albeit with a perf hit to any TSX code in + * the guest), so allow the feature bits to remain set. + */ + if ( cpu_has_tsx_ctrl ) + { + __set_bit(X86_FEATURE_HLE, max_fs); + __set_bit(X86_FEATURE_RTM, max_fs); + } + /* Clamp the toolstacks choices to reality. */ for ( i = 0; i < ARRAY_SIZE(fs); i++ ) fs[i] &= max_fs[i]; @@ -867,7 +881,8 @@ * damage itself. * * - Enlightened CPUID or CPUID faulting available: - * Xen can fully control what is seen here. Guest kernels need + * Xen can fully control what is seen here. When the guest has + * been configured to have XSAVE available, guest kernels need * to see the leaked OSXSAVE via the enlightened path, but * guest userspace and the native is given architectural * behaviour. @@ -877,7 +892,8 @@ */ /* OSXSAVE clear in policy. Fast-forward CR4 back in. */ if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) || - (regs->entry_vector == TRAP_invalid_op && + (p->basic.xsave && + regs->entry_vector == TRAP_invalid_op && guest_kernel_mode(v, regs) && (read_cr4() & X86_CR4_OSXSAVE)) ) res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/crash.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/crash.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/crash.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/crash.c 2019-12-11 14:35:39.000000000 +0000 @@ -30,6 +30,7 @@ #include #include #include +#include static cpumask_t waiting_to_crash; static unsigned int crashing_cpu; @@ -155,6 +156,12 @@ msecs--; } + /* + * We may have NMI'd another CPU while it was holding the console lock. + * It won't be in a position to release the lock... + */ + console_force_unlock(); + /* Leave a hint of how well we did trying to shoot down the other cpus */ if ( cpumask_empty(&waiting_to_crash) ) printk("Shot down all CPUs\n"); @@ -172,15 +179,20 @@ */ iommu_crash_shutdown(); - __stop_this_cpu(); + if ( cpu_online(cpu) ) + { + __stop_this_cpu(); - /* This is a bit of a hack due to the problems with the x2apic_enabled - * variable, but we can't do any better without a significant refactoring - * of the APIC code */ - x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC); + /* + * This is a bit of a hack due to the problems with the x2apic_enabled + * variable, but we can't do any better without a significant + * refactoring of the APIC code + */ + x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC); - disable_IO_APIC(); - hpet_disable(); + disable_IO_APIC(); + hpet_disable(); + } } void machine_crash_shutdown(void) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/domain.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/domain.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/domain.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/domain.c 2019-12-11 14:35:39.000000000 +0000 @@ -112,7 +112,7 @@ * this case, heap corruption or #PF can occur (when heap debugging is * enabled).
For example, even printk() can involve tasklet scheduling, * which touches per-cpu vars. - * + * * Consider very carefully when adding code to *dead_idle. Most hypervisor * subsystems are unsafe to call. */ @@ -406,9 +406,6 @@ xfree(v->arch.msr); v->arch.msr = NULL; - if ( !is_idle_domain(v->domain) ) - vpmu_destroy(v); - if ( is_hvm_vcpu(v) ) hvm_vcpu_destroy(v); else @@ -1074,9 +1071,15 @@ rc = -ERESTART; /* Fallthrough */ case -ERESTART: + /* + * NB that we're putting the kernel-mode table + * here, which we've already successfully + * validated above; hence partial = false; + */ v->arch.old_guest_ptpg = NULL; v->arch.old_guest_table = pagetable_get_page(v->arch.guest_table); + v->arch.old_guest_table_partial = false; v->arch.guest_table = pagetable_null(); break; default: @@ -1510,21 +1513,24 @@ bool rc; struct guest_memory_policy policy = { .nested_guest_mode = false }; void __user *guest_handle = NULL; + struct vcpu_runstate_info runstate; if ( guest_handle_is_null(runstate_guest(v)) ) return true; update_guest_memory_policy(v, &policy); + memcpy(&runstate, &v->runstate, sizeof(runstate)); + if ( VM_ASSIST(v->domain, runstate_update_flag) ) { guest_handle = has_32bit_shinfo(v->domain) ? &v->runstate_guest.compat.p->state_entry_time + 1 : &v->runstate_guest.native.p->state_entry_time + 1; guest_handle--; - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + (void *)(&runstate.state_entry_time + 1) - 1, 1); smp_wmb(); } @@ -1532,20 +1538,20 @@ { struct compat_vcpu_runstate_info info; - XLAT_vcpu_runstate_info(&info, &v->runstate); + XLAT_vcpu_runstate_info(&info, &runstate); __copy_to_guest(v->runstate_guest.compat, &info, 1); rc = true; } else - rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) != - sizeof(v->runstate); + rc = __copy_to_guest(runstate_guest(v), &runstate, 1) != + sizeof(runstate); if ( guest_handle ) { - v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; smp_wmb(); __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + (void *)(&runstate.state_entry_time + 1) - 1, 1); } update_guest_memory_policy(v, &policy); @@ -1838,9 +1844,34 @@ break; case -ERESTART: case -EINTR: + /* + * -EINTR means PGT_validated has been re-set; re-set + * PGT_pinned again so that it gets picked up next time + * around. + * + * -ERESTART, OTOH, means PGT_partial is set instead. Put + * it back on the list, but don't set PGT_pinned; the + * section below will finish off de-validation. But we do + * need to drop the general ref associated with + * PGT_pinned, since put_page_and_type_preemptible() + * didn't do it. + * + * NB we can do an ASSERT for PGT_validated, since we + * "own" the type ref; but theoretically, the PGT_partial + * could be cleared by someone else. + */ + if ( ret == -EINTR ) + { + ASSERT(page->u.inuse.type_info & PGT_validated); + set_bit(_PGT_pinned, &page->u.inuse.type_info); + } + else + put_page(page); + ret = -ERESTART; + + /* Put the page back on the list and drop the ref we grabbed above */ page_list_add(page, list); - set_bit(_PGT_pinned, &page->u.inuse.type_info); put_page(page); goto out; default: @@ -1885,6 +1916,25 @@ goto out; case -ERESTART: page_list_add(page, list); + /* + * PGT_partial holds a type ref and a general ref. 
+ * If we came in with PGT_partial set, then we 1) + * don't need to grab an extra type count, and 2) + * do need to drop the extra page ref we grabbed + * at the top of the loop. If we didn't come in + * with PGT_partial set, we 1) do need to grab an + * extra type count, but 2) can transfer the page + * ref we grabbed above to it. + * + * Note that we must increment type_info before + * setting PGT_partial. Theoretically it should + * be safe to drop the page ref before setting + * PGT_partial, but do it afterwards just to be + * extra safe. + */ + if ( !(x & PGT_partial) ) + page->u.inuse.type_info++; + smp_wmb(); page->u.inuse.type_info |= PGT_partial; if ( x & PGT_partial ) put_page(page); @@ -1939,12 +1989,17 @@ if ( ret ) return ret; - /* Drop the in-use references to page-table bases. */ + /* + * Drop the in-use references to page-table bases and clean + * up vPMU instances. + */ for_each_vcpu ( d, v ) { ret = vcpu_destroy_pagetables(v); if ( ret ) return ret; + + vpmu_destroy(v); } if ( is_pv_domain(d) ) @@ -2062,7 +2117,7 @@ * pending flag. These values may fluctuate (after all, we hold no * locks) but the key insight is that each change will cause * evtchn_upcall_pending to be polled. - * + * * NB2. We save the running flag across the unblock to avoid a needless * IPI for domains that we IPI'd to unblock. */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/domctl.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/domctl.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/domctl.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/domctl.c 2019-12-11 14:35:39.000000000 +0000 @@ -210,11 +210,15 @@ if ( is_pv_domain(d) && ((levelling_caps & LCAP_7ab0) == LCAP_7ab0) ) { uint64_t mask = cpuidmask_defaults._7ab0; - uint32_t eax = ctl->eax; - uint32_t ebx = p->feat._7b0; + /* + * Leaf 7[0].eax is max_subleaf, not a feature mask. Take it + * wholesale from the policy, but clamp the features in 7[0].ebx + * per usual. + */ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) - mask &= ((uint64_t)eax << 32) | ebx; + mask = (((uint64_t)p->feat.max_subleaf << 32) | + ((uint32_t)mask & p->feat._7b0)); d->arch.pv_domain.cpuidmasks->_7ab0 = mask; } @@ -484,6 +488,26 @@ ret = -EFAULT; break; } + + /* + * Avoid checking for preemption when the `hostp2m' lock isn't + * involved, i.e. non-translated guest, and avoid preemption on + * the last iteration.
+ */ + if ( paging_mode_translate(d) && + likely((i + 1) < num) && hypercall_preempt_check() ) + { + domctl->u.getpageframeinfo3.num = num - i - 1; + domctl->u.getpageframeinfo3.array.p = + guest_handle + ((i + 1) * width); + if ( __copy_to_guest(u_domctl, domctl, 1) ) + { + ret = -EFAULT; + break; + } + return hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + } } break; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/efi/efi-boot.h xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/efi/efi-boot.h --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/efi/efi-boot.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/efi/efi-boot.h 2019-12-11 14:35:39.000000000 +0000 @@ -528,9 +528,10 @@ bpp = set_color(mode_info->PixelInformation.BlueMask, bpp, &vga_console_info.u.vesa_lfb.blue_pos, &vga_console_info.u.vesa_lfb.blue_size); - bpp = set_color(mode_info->PixelInformation.ReservedMask, bpp, - &vga_console_info.u.vesa_lfb.rsvd_pos, - &vga_console_info.u.vesa_lfb.rsvd_size); + if ( mode_info->PixelInformation.ReservedMask ) + bpp = set_color(mode_info->PixelInformation.ReservedMask, bpp, + &vga_console_info.u.vesa_lfb.rsvd_pos, + &vga_console_info.u.vesa_lfb.rsvd_size); if ( bpp > 0 ) break; /* fall through */ @@ -550,6 +551,7 @@ vga_console_info.u.vesa_lfb.bytes_per_line = (mode_info->PixelsPerScanLine * bpp + 7) >> 3; vga_console_info.u.vesa_lfb.lfb_base = gop->Mode->FrameBufferBase; + vga_console_info.u.vesa_lfb.ext_lfb_base = gop->Mode->FrameBufferBase >> 32; vga_console_info.u.vesa_lfb.lfb_size = (gop->Mode->FrameBufferSize + 0xffff) >> 16; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/flushtlb.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/flushtlb.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/flushtlb.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/flushtlb.c 2019-12-11 14:35:39.000000000 +0000 @@ -76,17 +76,18 @@ static void do_tlb_flush(void) { + unsigned long cr4; u32 t = pre_flush(); if ( use_invpcid ) invpcid_flush_all(); - else + else if ( (cr4 = read_cr4()) & X86_CR4_PGE ) { - unsigned long cr4 = read_cr4(); - - write_cr4(cr4 ^ X86_CR4_PGE); + write_cr4(cr4 & ~X86_CR4_PGE); write_cr4(cr4); } + else + write_cr3(read_cr3()); post_flush(t); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/hpet.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/hpet.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/hpet.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/hpet.c 2019-12-11 14:35:39.000000000 +0000 @@ -258,10 +258,14 @@ * Detect time values set in the past. This is hard to do for 32-bit * comparators as the timer does not have to be set that far in the future * for the counter difference to wrap a 32-bit signed integer. We fudge - * by looking for a 'small' time value in the past. + * by looking for a 'small' time value in the past. However, if we + * are restoring after migrate, treat any wrap as past since the value + * is unlikely to be 'small'. */ if ( (int64_t)diff < 0 ) - diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) + diff = (timer_is_32bit(h, tn) && + vhpet_domain(h)->creation_finished && + (-diff > HPET_TINY_TIME_SPAN)) ? 
(uint32_t)diff : 0; destroy_periodic_time(&h->pt[tn]); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/hvm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/hvm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/hvm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/hvm.c 2019-12-11 14:35:39.000000000 +0000 @@ -429,6 +429,8 @@ hvm_set_guest_tsc(v, guest_tsc); v->arch.hvm_vcpu.msr_tsc_adjust += v->arch.hvm_vcpu.cache_tsc_offset - tsc_offset; + if ( v == current ) + update_vcpu_system_time(v); } static void hvm_set_guest_tsc_adjust(struct vcpu *v, u64 tsc_adjust) @@ -437,6 +439,8 @@ - v->arch.hvm_vcpu.msr_tsc_adjust; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0); v->arch.hvm_vcpu.msr_tsc_adjust = tsc_adjust; + if ( v == current ) + update_vcpu_system_time(v); } u64 hvm_get_guest_tsc_fixed(struct vcpu *v, uint64_t at_tsc) @@ -1706,6 +1710,7 @@ struct p2m_domain *p2m, *hostp2m; int rc, fall_through = 0, paged = 0; int sharing_enomem = 0; + unsigned int page_order = 0; vm_event_request_t *req_ptr = NULL; bool_t ap2m_active, sync = 0; @@ -1774,7 +1779,7 @@ hostp2m = p2m_get_hostp2m(currd); mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma, P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0), - NULL); + &page_order); if ( ap2m_active ) { @@ -1786,7 +1791,7 @@ goto out; } - mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL); + mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order); } else p2m = hostp2m; @@ -1828,6 +1833,24 @@ break; } + /* + * Workaround for XSA-304 / CVE-2018-12207. If we take an execution + * fault against a non-executable superpage, shatter it to regain + * execute permissions. + */ + if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation ) + { + int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K, + p2mt, p2ma); + + if ( res ) + printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n", + gfn, res); + + rc = !res; + goto out_put_gfn; + } + if ( violation ) { /* Should #VE be emulated for this fault? */ @@ -2893,7 +2916,7 @@ void hvm_task_switch( uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, - int32_t errcode) + int32_t errcode, unsigned int insn_len) { struct vcpu *v = current; struct cpu_user_regs *regs = guest_cpu_user_regs(); @@ -2967,7 +2990,7 @@ if ( taskswitch_reason == TSW_iret ) eflags &= ~X86_EFLAGS_NT; - tss.eip = regs->eip; + tss.eip = regs->eip + insn_len; tss.eflags = eflags; tss.eax = regs->eax; tss.ecx = regs->ecx; @@ -4679,12 +4702,10 @@ if ( rc > 0 ) { a.u.set_mem_access_multi.opaque = rc; + rc = -ERESTART; if ( __copy_field_to_guest(guest_handle_cast(arg, xen_hvm_altp2m_op_t), &a, u.set_mem_access_multi.opaque) ) rc = -EFAULT; - else - rc = hypercall_create_continuation(__HYPERVISOR_hvm_op, "lh", - HVMOP_altp2m, arg); } break; @@ -4786,14 +4807,8 @@ switch ( a.cmd ) { case HVMOP_altp2m_set_mem_access_multi: - /* - * The return code can be positive only if it is the return value - * of hypercall_create_continuation. In this case, the opaque value - * must be copied back to the guest. 
- */ - if ( rc > 0 ) + if ( rc == -ERESTART ) { - ASSERT(rc == __HYPERVISOR_hvm_op); a.u.set_mem_access_multi.opaque = nat.altp2m_op->u.set_mem_access_multi.opaque; if ( __copy_field_to_guest(guest_handle_cast(arg, diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/ioreq.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/ioreq.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/ioreq.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/ioreq.c 2019-12-11 14:35:39.000000000 +0000 @@ -1246,7 +1246,7 @@ return 0; fail: - while ( id-- != 0 ) + while ( ++id != MAX_NR_IOREQ_SERVERS ) { s = GET_IOREQ_SERVER(d, id); @@ -1334,7 +1334,7 @@ d->arch.cpuid->x86_vendor == X86_VENDOR_AMD && (x86_fam = get_cpu_family( d->arch.cpuid->basic.raw_fms, NULL, NULL)) > 0x10 && - x86_fam <= 0x17 ) + x86_fam < 0x17 ) { uint64_t msr_val; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/irq.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/irq.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/irq.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/irq.c 2019-12-11 14:35:39.000000000 +0000 @@ -497,6 +497,15 @@ struct hvm_domain *plat = &v->domain->arch.hvm_domain; int vector; + /* + * Always call vlapic_sync_pir_to_irr so that PIR is synced into IRR when + * using posted interrupts. Note this is also done by + * vlapic_has_pending_irq but depending on which interrupts are pending + * hvm_vcpu_has_pending_irq will return early without calling + * vlapic_has_pending_irq. + */ + vlapic_sync_pir_to_irr(v); + if ( unlikely(v->nmi_pending) ) return hvm_intack_nmi; @@ -562,12 +571,6 @@ return !hvm_interrupt_blocked(v, intack); } -void arch_evtchn_inject(struct vcpu *v) -{ - if ( is_hvm_vcpu(v) ) - hvm_assert_evtchn_irq(v); -} - static void irq_dump(struct domain *d) { struct hvm_irq *hvm_irq = hvm_domain_irq(d); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/emulate.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/emulate.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/emulate.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/emulate.c 2019-12-11 14:35:39.000000000 +0000 @@ -160,6 +160,60 @@ } /* + * TASK_SWITCH vmexits never provide an instruction length. We must always + * decode under %rip to find the answer. + */ +unsigned int svm_get_task_switch_insn_len(void) +{ + struct hvm_emulate_ctxt ctxt; + struct x86_emulate_state *state; + unsigned int emul_len, modrm_reg; + + hvm_emulate_init_once(&ctxt, NULL, guest_cpu_user_regs()); + hvm_emulate_init_per_insn(&ctxt, NULL, 0); + state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch); + if ( IS_ERR_OR_NULL(state) ) + return 0; + + emul_len = x86_insn_length(state, &ctxt.ctxt); + + /* + * Check for an instruction which can cause a task switch. Any far + * jmp/call/ret, any software interrupt/exception with trap semantics + * (except icebp - handled specially), and iret. 
+ */ + switch ( ctxt.ctxt.opcode ) + { + case 0xff: /* Grp 5 */ + /* call / jmp (far, absolute indirect) */ + if ( (unsigned int)x86_insn_modrm(state, NULL, &modrm_reg) >= 3 || + (modrm_reg != 3 && modrm_reg != 5) ) + { + default: + printk(XENLOG_G_WARNING "Bad instruction for task switch\n"); + hvm_dump_emulation_state(XENLOG_G_WARNING, "SVM Insn len", + &ctxt, X86EMUL_UNHANDLEABLE); + emul_len = 0; + break; + } + /* Fallthrough */ + case 0x9a: /* call (far, absolute) */ + case 0xca: /* ret imm16 (far) */ + case 0xcb: /* ret (far) */ + case 0xcc: /* int3 */ + case 0xcd: /* int imm8 */ + case 0xce: /* into */ + case 0xcf: /* iret */ + case 0xea: /* jmp (far, absolute) */ + break; + } + + x86_emulate_free_state(state); + + return emul_len; +} + +/* * Local variables: * mode: C * c-file-style: "BSD" diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/svm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/svm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/svm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/svm.c 2019-12-11 14:35:39.000000000 +0000 @@ -172,24 +172,6 @@ svm_intercept_msr(v, msr, MSR_INTERCEPT_WRITE); } -static void svm_set_icebp_interception(struct domain *d, bool enable) -{ - const struct vcpu *v; - - for_each_vcpu ( d, v ) - { - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - uint32_t intercepts = vmcb_get_general2_intercepts(vmcb); - - if ( enable ) - intercepts |= GENERAL2_INTERCEPT_ICEBP; - else - intercepts &= ~GENERAL2_INTERCEPT_ICEBP; - - vmcb_set_general2_intercepts(vmcb, intercepts); - } -} - static void svm_save_dr(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -683,21 +665,21 @@ cp->extd.ibpb ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW); } -static void svm_sync_vmcb(struct vcpu *v, enum vmcb_sync_state new_state) +void svm_sync_vmcb(struct vcpu *v, enum vmcb_sync_state new_state) { struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; if ( new_state == vmcb_needs_vmsave ) { if ( arch_svm->vmcb_sync_state == vmcb_needs_vmload ) - svm_vmload(arch_svm->vmcb); + svm_vmload_pa(arch_svm->vmcb_pa); arch_svm->vmcb_sync_state = new_state; } else { if ( arch_svm->vmcb_sync_state == vmcb_needs_vmsave ) - svm_vmsave(arch_svm->vmcb); + svm_vmsave_pa(arch_svm->vmcb_pa); if ( arch_svm->vmcb_sync_state != vmcb_needs_vmload ) arch_svm->vmcb_sync_state = new_state; @@ -2618,7 +2600,6 @@ .msr_read_intercept = svm_msr_read_intercept, .msr_write_intercept = svm_msr_write_intercept, .enable_msr_interception = svm_enable_msr_interception, - .set_icebp_interception = svm_set_icebp_interception, .set_rdtsc_exiting = svm_set_rdtsc_exiting, .set_descriptor_access_exiting = svm_set_descriptor_access_exiting, .get_insn_bytes = svm_get_insn_bytes, @@ -2921,7 +2902,52 @@ case VMEXIT_TASK_SWITCH: { enum hvm_task_switch_reason reason; - int32_t errcode = -1; + int32_t errcode = -1, insn_len = -1; + + /* + * All TASK_SWITCH intercepts have fault-like semantics. NRIP is + * never provided, even for instruction-induced task switches, but we + * need to know the instruction length in order to set %eip suitably + * in the outgoing TSS. + * + * For a task switch which vectored through the IDT, look at the type + * to distinguish interrupts/exceptions from instruction based + * switches. + */ + if ( vmcb->exitintinfo.fields.v ) + { + switch ( vmcb->exitintinfo.fields.type ) + { + /* + * #BP and #OF are from INT3/INTO respectively. #DB from + * ICEBP is handled specially, and already has fault + * semantics. 
+ */ + case X86_EVENTTYPE_HW_EXCEPTION: + if ( vmcb->exitintinfo.fields.vector == TRAP_int3 || + vmcb->exitintinfo.fields.vector == TRAP_overflow ) + break; + /* Fallthrough */ + case X86_EVENTTYPE_EXT_INTR: + case X86_EVENTTYPE_NMI: + insn_len = 0; + break; + } + + /* + * The common logic above will have forwarded the vectoring + * information. Undo this as we are going to emulate. + */ + vmcb->eventinj.bytes = 0; + } + + /* + * insn_len being -1 indicates that we have an instruction-induced + * task switch. Decode under %rip to find its length. + */ + if ( insn_len < 0 && (insn_len = svm_get_task_switch_insn_len()) == 0 ) + goto crash_or_fault; + if ( (vmcb->exitinfo2 >> 36) & 1 ) reason = TSW_iret; else if ( (vmcb->exitinfo2 >> 38) & 1 ) @@ -2931,15 +2957,7 @@ if ( (vmcb->exitinfo2 >> 44) & 1 ) errcode = (uint32_t)vmcb->exitinfo2; - /* - * Some processors set the EXITINTINFO field when the task switch - * is caused by a task gate in the IDT. In this case we will be - * emulating the event injection, so we do not want the processor - * to re-inject the original event! - */ - vmcb->eventinj.bytes = 0; - - hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode); + hvm_task_switch(vmcb->exitinfo1, reason, errcode, insn_len); break; } @@ -3136,6 +3154,7 @@ gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", " "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n", exit_reason, vmcb->exitinfo1, vmcb->exitinfo2); + crash_or_fault: svm_crash_or_fault(v); break; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/svmdebug.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/svmdebug.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/svmdebug.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/svmdebug.c 2019-12-11 14:35:39.000000000 +0000 @@ -29,6 +29,15 @@ void svm_vmcb_dump(const char *from, const struct vmcb_struct *vmcb) { + struct vcpu *curr = current; + + /* + * If we are dumping the VMCB currently in context, some guest state may + * still be cached in hardware. Retrieve it. + */ + if ( vmcb == curr->arch.hvm_svm.vmcb ) + svm_sync_vmcb(curr, vmcb_in_sync); + printk("Dumping guest's current state at %s...\n", from); printk("Size of VMCB = %zu, paddr = %"PRIpaddr", vaddr = %p\n", sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/vmcb.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/vmcb.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/svm/vmcb.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/svm/vmcb.c 2019-12-11 14:35:39.000000000 +0000 @@ -73,7 +73,7 @@ GENERAL2_INTERCEPT_STGI | GENERAL2_INTERCEPT_CLGI | GENERAL2_INTERCEPT_SKINIT | GENERAL2_INTERCEPT_MWAIT | GENERAL2_INTERCEPT_WBINVD | GENERAL2_INTERCEPT_MONITOR | - GENERAL2_INTERCEPT_XSETBV; + GENERAL2_INTERCEPT_XSETBV | GENERAL2_INTERCEPT_ICEBP; /* Intercept all debug-register writes. 
*/ vmcb->_dr_intercepts = ~0u; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vlapic.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vlapic.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vlapic.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vlapic.c 2019-12-11 14:35:39.000000000 +0000 @@ -113,8 +113,7 @@ static int vlapic_find_highest_irr(struct vlapic *vlapic) { - if ( hvm_funcs.sync_pir_to_irr ) - hvm_funcs.sync_pir_to_irr(vlapic_vcpu(vlapic)); + vlapic_sync_pir_to_irr(vlapic_vcpu(vlapic)); return vlapic_find_highest_vector(&vlapic->regs->data[APIC_IRR]); } @@ -974,6 +973,7 @@ case APIC_SPIV: if ( msr_content & ~(APIC_VECTOR_MASK | APIC_SPIV_APIC_ENABLED | + APIC_SPIV_FOCUS_DISABLED | (VLAPIC_VERSION & APIC_LVR_DIRECTED_EOI ? APIC_SPIV_DIRECTED_EOI : 0)) ) return X86EMUL_UNHANDLEABLE; @@ -1437,8 +1437,7 @@ for_each_vcpu ( d, v ) { - if ( hvm_funcs.sync_pir_to_irr ) - hvm_funcs.sync_pir_to_irr(v); + vlapic_sync_pir_to_irr(v); s = vcpu_vlapic(v); if ( (rc = hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs)) != 0 ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vmcs.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vmcs.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vmcs.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vmcs.c 2019-12-11 14:35:39.000000000 +0000 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,7 @@ static bool_t __read_mostly opt_pml_enabled = 1; static s8 __read_mostly opt_ept_ad = -1; +int8_t __read_mostly opt_ept_exec_sp = -1; /* * The 'ept' parameter controls functionalities that depend on, or impact the @@ -94,6 +96,8 @@ opt_pml_enabled = val; else if ( !cmdline_strcmp(s, "ad") ) opt_ept_ad = val; + else if ( !cmdline_strcmp(s, "exec-sp") ) + opt_ept_exec_sp = val; else rc = -EINVAL; @@ -104,6 +108,55 @@ } custom_param("ept", parse_ept_param); +static int parse_ept_param_runtime(const char *s) +{ + struct domain *d; + int val; + + if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || + !(hvm_funcs.hap_capabilities & + (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) + { + printk("VMX: EPT not available, or not in use - ignoring\n"); + return 0; + } + + if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 ) + return -EINVAL; + + opt_ept_exec_sp = val; + + rcu_read_lock(&domlist_read_lock); + for_each_domain ( d ) + { + /* PV, or HVM Shadow domain? Not applicable. */ + if ( !paging_mode_hap(d) ) + continue; + + /* Hardware domain? Not applicable. */ + if ( is_hardware_domain(d) ) + continue; + + /* Nested Virt? Broken and exec_sp forced on to avoid livelocks. */ + if ( nestedhvm_enabled(d) ) + continue; + + /* Setting already matches? No need to rebuild the p2m. */ + if ( d->arch.hvm_domain.vmx.exec_sp == val ) + continue; + + d->arch.hvm_domain.vmx.exec_sp = val; + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); + } + rcu_read_unlock(&domlist_read_lock); + + printk("VMX: EPT executable superpages %sabled\n", + val ? "en" : "dis"); + + return 0; +} +custom_runtime_only_param("ept", parse_ept_param_runtime); + /* Dynamic (run-time adjusted) execution control flags. 
*/ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; @@ -807,7 +860,7 @@ (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)); __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]); - __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu)); + __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(tss_page, cpu).tss); __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom()); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vmx.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vmx.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vmx.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vmx.c 2019-12-11 14:35:39.000000000 +0000 @@ -404,6 +404,12 @@ d->arch.ctxt_switch = &csw; + /* + * Work around CVE-2018-12207? The hardware domain is already permitted + * to reboot the system, so doesn't need mitigating against DoS's. + */ + d->arch.hvm_domain.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; + if ( !has_vlapic(d) ) return 0; @@ -2174,7 +2180,14 @@ mfn = get_gfn_query_unlocked(d, gfn_x(vcpu_altp2m(v).veinfo_gfn), &t); if ( !mfn_eq(mfn, INVALID_MFN) ) + { __vmwrite(VIRT_EXCEPTION_INFO, mfn_x(mfn) << PAGE_SHIFT); + /* + * Make sure we have an up-to-date EPTP_INDEX when + * setting SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS. + */ + __vmwrite(EPTP_INDEX, vcpu_altp2m(v).p2midx); + } else v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS; @@ -2406,7 +2419,103 @@ } static void __init lbr_tsx_fixup_check(void); -static void __init bdw_erratum_bdf14_fixup_check(void); +static void __init bdf93_fixup_check(void); + +/* + * Calculate whether the CPU is vulnerable to Instruction Fetch page + * size-change MCEs. + */ +static bool __init has_if_pschange_mc(void) +{ + uint64_t caps = 0; + + /* + * If we are virtualised, there is nothing we can do. Our EPT tables are + * shadowed by our hypervisor, and not walked by hardware. + */ + if ( cpu_has_hypervisor ) + return false; + + if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) + rdmsrl(MSR_ARCH_CAPABILITIES, caps); + + if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO ) + return false; + + /* + * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at + * this time. + */ + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6 ) + return false; + + switch ( boot_cpu_data.x86_model ) + { + /* + * Core processors since at least Nehalem are vulnerable. + */ + case 0x1f: /* Auburndale / Havendale */ + case 0x1e: /* Nehalem */ + case 0x1a: /* Nehalem EP */ + case 0x2e: /* Nehalem EX */ + case 0x25: /* Westmere */ + case 0x2c: /* Westmere EP */ + case 0x2f: /* Westmere EX */ + case 0x2a: /* SandyBridge */ + case 0x2d: /* SandyBridge EP/EX */ + case 0x3a: /* IvyBridge */ + case 0x3e: /* IvyBridge EP/EX */ + case 0x3c: /* Haswell */ + case 0x3f: /* Haswell EX/EP */ + case 0x45: /* Haswell D */ + case 0x46: /* Haswell H */ + case 0x3d: /* Broadwell */ + case 0x47: /* Broadwell H */ + case 0x4f: /* Broadwell EP/EX */ + case 0x56: /* Broadwell D */ + case 0x4e: /* Skylake M */ + case 0x5e: /* Skylake D */ + case 0x55: /* Skylake-X / Cascade Lake */ + case 0x8e: /* Kaby / Coffee / Whiskey Lake M */ + case 0x9e: /* Kaby / Coffee / Whiskey Lake D */ + return true; + + /* + * Atom processors are not vulnerable. 
+ */ + case 0x1c: /* Pineview */ + case 0x26: /* Lincroft */ + case 0x27: /* Penwell */ + case 0x35: /* Cloverview */ + case 0x36: /* Cedarview */ + case 0x37: /* Baytrail / Valleyview (Silvermont) */ + case 0x4d: /* Avaton / Rangely (Silvermont) */ + case 0x4c: /* Cherrytrail / Brasswell */ + case 0x4a: /* Merrifield */ + case 0x5a: /* Moorefield */ + case 0x5c: /* Goldmont */ + case 0x5d: /* SoFIA 3G Granite/ES2.1 */ + case 0x65: /* SoFIA LTE AOSP */ + case 0x5f: /* Denverton */ + case 0x6e: /* Cougar Mountain */ + case 0x75: /* Lightning Mountain */ + case 0x7a: /* Gemini Lake */ + case 0x86: /* Jacobsville */ + + /* + * Knights processors are not vulnerable. + */ + case 0x57: /* Knights Landing */ + case 0x85: /* Knights Mill */ + return false; + + default: + printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n", + boot_cpu_data.x86_model); + return true; + } +} const struct hvm_function_table * __init start_vmx(void) { @@ -2428,6 +2537,17 @@ */ if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) ) { + bool cpu_has_bug_pschange_mc = has_if_pschange_mc(); + + if ( opt_ept_exec_sp == -1 ) + { + /* Default to non-executable superpages on vulnerable hardware. */ + opt_ept_exec_sp = !cpu_has_bug_pschange_mc; + + if ( cpu_has_bug_pschange_mc ) + printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); + } + vmx_function_table.hap_supported = 1; vmx_function_table.altp2m_supported = 1; @@ -2473,7 +2593,7 @@ setup_vmcs_dump(); lbr_tsx_fixup_check(); - bdw_erratum_bdf14_fixup_check(); + bdf93_fixup_check(); return &vmx_function_table; } @@ -2641,14 +2761,6 @@ return X86EMUL_OKAY; } -/* This defines the layout of struct lbr_info[] */ -#define LBR_LASTINT_FROM_IDX 0 -#define LBR_LASTINT_TO_IDX 1 -#define LBR_LASTBRANCH_TOS_IDX 2 -#define LBR_LASTBRANCH_FROM_IDX 3 -#define LBR_LASTBRANCH_TO_IDX 4 -#define LBR_LASTBRANCH_INFO 5 - static const struct lbr_info { u32 base, count; } p4_lbr[] = { @@ -2780,52 +2892,76 @@ #define LBR_MSRS_INSERTED (1u << 0) #define LBR_FIXUP_TSX (1u << 1) -#define LBR_FIXUP_BDF14 (1u << 2) -#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14) +#define LBR_FIXUP_BDF93 (1u << 2) +#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF93) static bool __read_mostly lbr_tsx_fixup_needed; -static bool __read_mostly bdw_erratum_bdf14_fixup_needed; -static uint32_t __read_mostly lbr_from_start; -static uint32_t __read_mostly lbr_from_end; -static uint32_t __read_mostly lbr_lastint_from; +static bool __read_mostly bdf93_fixup_needed; static void __init lbr_tsx_fixup_check(void) { - bool tsx_support = cpu_has_hle || cpu_has_rtm; uint64_t caps; uint32_t lbr_format; - /* Fixup is needed only when TSX support is disabled ... */ - if ( tsx_support ) + /* + * HSM182, HSD172, HSE117, BDM127, BDD117, BDF85, BDE105: + * + * On processors that do not support Intel Transactional Synchronization + * Extensions (Intel TSX) (CPUID.07H.EBX bits 4 and 11 are both zero), + * writes to MSR_LASTBRANCH_x_FROM_IP (MSR 680H to 68FH) may #GP unless + * bits[62:61] are equal to bit[47]. + * + * Software should sign extend the MSRs. + * + * Experimentally, MSR_LER_FROM_LIP (1DDH) is similarly impacted, so is + * fixed up as well. 
+ */ + if ( cpu_has_hle || cpu_has_rtm || + boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6 ) + return; + + switch ( boot_cpu_data.x86_model ) + { + case 0x3c: /* HSM182, HSD172 - 4th gen Core */ + case 0x3f: /* HSE117 - Xeon E5 v3 */ + case 0x45: /* HSM182 - 4th gen Core */ + case 0x46: /* HSM182, HSD172 - 4th gen Core (GT3) */ + case 0x3d: /* BDM127 - 5th gen Core */ + case 0x47: /* BDD117 - 5th gen Core (GT3) */ + case 0x4f: /* BDF85 - Xeon E5-2600 v4 */ + case 0x56: /* BDE105 - Xeon D-1500 */ + break; + default: return; + } + /* + * Fixup is needed only when TSX support is disabled and the address + * format of LBR includes TSX bits 61:62 + */ if ( !cpu_has_pdcm ) return; rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps); lbr_format = caps & MSR_IA32_PERF_CAP_LBR_FORMAT; - /* ... and the address format of LBR includes TSX bits 61:62 */ if ( lbr_format == LBR_FORMAT_EIP_FLAGS_TSX ) - { - const struct lbr_info *lbr = last_branch_msr_get(); - - if ( lbr == NULL ) - return; - - lbr_lastint_from = lbr[LBR_LASTINT_FROM_IDX].base; - lbr_from_start = lbr[LBR_LASTBRANCH_FROM_IDX].base; - lbr_from_end = lbr_from_start + lbr[LBR_LASTBRANCH_FROM_IDX].count; - lbr_tsx_fixup_needed = true; - } } -static void __init bdw_erratum_bdf14_fixup_check(void) +static void __init bdf93_fixup_check(void) { - /* Broadwell E5-2600 v4 processors need to work around erratum BDF14. */ - if ( boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 79 ) - bdw_erratum_bdf14_fixup_needed = true; + /* + * Broadwell erratum BDF93: + * + * Reads from MSR_LER_TO_LIP (MSR 1DEH) may return values for bits[63:61] + * that are not equal to bit[47]. Attempting to context switch this value + * may cause a #GP. Software should sign extend the MSR. + */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x4f ) + bdf93_fixup_needed = true; } static int is_last_branch_msr(u32 ecx) @@ -3190,8 +3326,8 @@ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED; if ( lbr_tsx_fixup_needed ) v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX; - if ( bdw_erratum_bdf14_fixup_needed ) - v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14; + if ( bdf93_fixup_needed ) + v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF93; } __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); @@ -3771,6 +3907,42 @@ HVMTRACE_1D(TRAP_DEBUG, exit_qualification); __restore_debug_registers(v); write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE); + + /* + * Work around SingleStep + STI/MovSS VMEntry failures. + * + * We intercept #DB unconditionally to work around CVE-2015-8104 / + * XSA-156 (guest-kernel induced host DoS). + * + * STI/MovSS shadows block/defer interrupts/exceptions (exact + * details are complicated and poorly documented). Debug + * exceptions delayed for any reason are stored in the + * PENDING_DBG_EXCEPTIONS field. + * + * The falling edge of PENDING_DBG causes #DB to be delivered, + * resulting in a VMExit, as #DB is intercepted. The VMCS still + * reports blocked-by-STI/MovSS. + * + * The VMEntry checks when EFLAGS.TF is set don't like a VMCS in + * this state. Despite a #DB queued in VMENTRY_INTR_INFO, the + * state is rejected as DR6.BS isn't pending. Fix this up. 
+ */ + if ( unlikely(regs->eflags & X86_EFLAGS_TF) ) + { + unsigned long int_info; + + __vmread(GUEST_INTERRUPTIBILITY_INFO, &int_info); + + if ( int_info & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) ) + { + unsigned long pending_dbg; + + __vmread(GUEST_PENDING_DBG_EXCEPTIONS, &pending_dbg); + __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, + pending_dbg | DR_STEP); + } + } + if ( !v->domain->debugger_attached ) { unsigned long insn_len = 0; @@ -3911,8 +4083,8 @@ __vmread(IDT_VECTORING_ERROR_CODE, &ecode); else ecode = -1; - regs->rip += inst_len; - hvm_task_switch((uint16_t)exit_qualification, reasons[source], ecode); + + hvm_task_switch(exit_qualification, reasons[source], ecode, inst_len); break; } case EXIT_REASON_CPUID: @@ -4205,8 +4377,12 @@ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; struct vmx_msr_entry *msr; - if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL ) + if ( (msr = vmx_find_msr(curr, MSR_P4_LASTBRANCH_0_FROM_LIP, + VMX_MSR_GUEST)) != NULL ) { + const unsigned int lbr_from_end = + MSR_P4_LASTBRANCH_0_FROM_LIP + NUM_MSR_P4_LASTBRANCH_FROM_TO; + /* * Sign extend into bits 61:62 while preserving bit 63 * The loop relies on the fact that MSR array is sorted. @@ -4215,7 +4391,8 @@ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); } - if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL ) + if ( (msr = vmx_find_msr(curr, MSR_IA32_LASTINTFROMIP, + VMX_MSR_GUEST)) != NULL ) msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); } @@ -4232,20 +4409,10 @@ } } -static void bdw_erratum_bdf14_fixup(void) +static void bdf93_fixup(void) { struct vcpu *curr = current; - /* - * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has - * been observed to have the top three bits corrupted as though the - * MSR is using the LBR_FORMAT_EIP_FLAGS_TSX format. This is - * incorrect and causes a vmentry failure -- the MSR should contain - * an offset into the current code segment. This is assumed to be - * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by - * sign-extending into bits 48:63. - */ - sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST); sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST); } @@ -4255,8 +4422,8 @@ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX ) lbr_tsx_fixup(); - if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 ) - bdw_erratum_bdf14_fixup(); + if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF93 ) + bdf93_fixup(); } /* Returns false if the vmentry has to be restarted */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vvmx.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vvmx.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hvm/vmx/vvmx.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hvm/vmx/vvmx.c 2019-12-11 14:35:39.000000000 +0000 @@ -59,10 +59,23 @@ int nvmx_vcpu_initialise(struct vcpu *v) { + struct domain *d = v->domain; struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct page_info *pg = alloc_domheap_page(NULL, 0); + /* + * Gross bodge. The nested p2m logic can't cope with the CVE-2018-12207 + * workaround of using NX EPT superpages, and livelocks. Nested HVM isn't + * security supported, so disable the workaround until the nested p2m + * logic can be improved. 
+ */ + if ( !d->arch.hvm_domain.vmx.exec_sp ) + { + d->arch.hvm_domain.vmx.exec_sp = true; + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); + } + if ( !pg ) { gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n"); @@ -1059,11 +1072,11 @@ nvcpu->guest_cr[0] = get_vvmcs(v, CR0_READ_SHADOW); nvcpu->guest_cr[4] = get_vvmcs(v, CR4_READ_SHADOW); - rc = hvm_set_cr0(get_vvmcs(v, GUEST_CR0), 1); + rc = hvm_set_cr4(get_vvmcs(v, GUEST_CR4), 1); if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); - rc = hvm_set_cr4(get_vvmcs(v, GUEST_CR4), 1); + rc = hvm_set_cr0(get_vvmcs(v, GUEST_CR0), 1); if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); @@ -1273,11 +1286,11 @@ __vmwrite(vmcs_h2g_field[i].guest_field, r); } - rc = hvm_set_cr0(get_vvmcs(v, HOST_CR0), 1); + rc = hvm_set_cr4(get_vvmcs(v, HOST_CR4), 1); if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); - rc = hvm_set_cr4(get_vvmcs(v, HOST_CR4), 1); + rc = hvm_set_cr0(get_vvmcs(v, HOST_CR0), 1); if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); @@ -2483,6 +2496,7 @@ nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_RDTSC: + case EXIT_REASON_RDTSCP: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_RDTSC_EXITING ) nvcpu->nv_vmexit_pending = 1; @@ -2493,6 +2507,8 @@ * avoiding changing guest_tsc and messing up timekeeping in L1 */ msr_split(regs, hvm_get_guest_tsc(v) + get_vvmcs(v, TSC_OFFSET)); + if ( exit_reason == EXIT_REASON_RDTSCP ) + regs->rcx = hvm_msr_tsc_aux(v); update_guest_eip(); return 1; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hypercall.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hypercall.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/hypercall.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/hypercall.c 2019-12-11 14:35:39.000000000 +0000 @@ -74,14 +74,15 @@ #undef COMP #undef ARGS -#define next_arg(fmt, args) ({ \ +#define NEXT_ARG(fmt, args) \ +({ \ unsigned long __arg; \ switch ( *(fmt)++ ) \ { \ case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ - default: __arg = 0; BUG(); \ + default: goto bad_fmt; \ } \ __arg; \ }) @@ -103,7 +104,7 @@ if ( mcs->flags & MCSF_in_multicall ) { for ( i = 0; *p != '\0'; i++ ) - mcs->call.args[i] = next_arg(p, args); + mcs->call.args[i] = NEXT_ARG(p, args); } else { @@ -115,7 +116,7 @@ { for ( i = 0; *p != '\0'; i++ ) { - arg = next_arg(p, args); + arg = NEXT_ARG(p, args); switch ( i ) { case 0: regs->rdi = arg; break; @@ -131,7 +132,7 @@ { for ( i = 0; *p != '\0'; i++ ) { - arg = next_arg(p, args); + arg = NEXT_ARG(p, args); switch ( i ) { case 0: regs->rbx = arg; break; @@ -148,8 +149,17 @@ va_end(args); return op; + + bad_fmt: + va_end(args); + gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); + ASSERT_UNREACHABLE(); + domain_crash(curr->domain); + return 0; } +#undef NEXT_ARG + int hypercall_xlat_continuation(unsigned int *id, unsigned int nr, unsigned int mask, ...) { diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/io_apic.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/io_apic.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/io_apic.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/io_apic.c 2019-12-11 14:35:39.000000000 +0000 @@ -517,8 +517,9 @@ if (entry.irr) { /* Make sure the trigger mode is set to level. 
*/ if (!entry.trigger) { + entry = __ioapic_read_entry(apic, pin, false); entry.trigger = 1; - __ioapic_write_entry(apic, pin, TRUE, entry); + __ioapic_write_entry(apic, pin, false, entry); } __io_apic_eoi(apic, entry.vector, pin); } @@ -528,7 +529,7 @@ */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - __ioapic_write_entry(apic, pin, TRUE, entry); + __ioapic_write_entry(apic, pin, false, entry); entry = __ioapic_read_entry(apic, pin, TRUE); if (entry.irr) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/irq.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/irq.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/irq.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/irq.c 2019-12-11 14:35:39.000000000 +0000 @@ -679,7 +679,8 @@ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR * to myself. */ - if (irr & (1 << (vector % 32))) { + if ( irr & (1u << (vector % 32)) ) + { send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); TRACE_3D(TRC_HW_IRQ_MOVE_CLEANUP_DELAY, irq, vector, smp_processor_id()); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/livepatch.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/livepatch.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/livepatch.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/livepatch.c 2019-12-11 14:35:39.000000000 +0000 @@ -10,10 +10,46 @@ #include #include #include +#include #include #include +static bool has_active_waitqueue(const struct vm_event_domain *ved) +{ + /* ved may be xzalloc()'d without INIT_LIST_HEAD() yet. */ + return (ved && !list_head_is_null(&ved->wq.list) && + !list_empty(&ved->wq.list)); +} + +/* + * x86's implementation of waitqueue violates the livepatching safey principle + * of having unwound every CPUs stack before modifying live content. + * + * Search through every domain and check that no vCPUs have an active + * waitqueue. + */ +int arch_livepatch_safety_check(void) +{ + struct domain *d; + + for_each_domain ( d ) + { + if ( has_active_waitqueue(d->vm_event_share) ) + goto fail; + if ( has_active_waitqueue(d->vm_event_paging) ) + goto fail; + if ( has_active_waitqueue(d->vm_event_monitor) ) + goto fail; + } + + return 0; + + fail: + printk(XENLOG_ERR LIVEPATCH "%pd found with active waitqueue\n", d); + return -EBUSY; +} + int arch_livepatch_quiesce(void) { /* Disable WP to allow changes to read-only pages. 
*/ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/microcode.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/microcode.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/microcode.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/microcode.c 2019-12-11 14:35:39.000000000 +0000 @@ -383,10 +383,15 @@ int __init early_microcode_update_cpu(bool start_update) { + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); int rc = 0; void *data = NULL; size_t len; + if ( !microcode_ops ) + return -ENOSYS; + if ( ucode_blob.size ) { len = ucode_blob.size; @@ -397,6 +402,9 @@ len = ucode_mod.mod_end; data = bootstrap_map(&ucode_mod); } + + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); + if ( data ) { if ( start_update && microcode_ops->start_update ) @@ -413,6 +421,8 @@ int __init early_microcode_init(void) { + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); int rc; rc = microcode_init_intel(); @@ -425,6 +435,8 @@ if ( microcode_ops ) { + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); + if ( ucode_mod.mod_end || ucode_blob.size ) rc = early_microcode_update_cpu(true); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/p2m-ept.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/p2m-ept.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/p2m-ept.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/p2m-ept.c 2019-12-11 14:35:39.000000000 +0000 @@ -215,6 +215,12 @@ break; } + /* + * Don't create executable superpages if we need to shatter them to + * protect against CVE-2018-12207. + */ + if ( !p2m->domain->arch.hvm_domain.vmx.exec_sp && is_epte_superpage(entry) ) + entry->x = 0; } #define GUEST_TABLE_MAP_FAILED 0 @@ -389,7 +395,8 @@ * present entries in the given page table, optionally marking the entries * also for their subtrees needing P2M type re-calculation. */ -static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc, int level) +static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc, + unsigned int parent_level) { int rc; ept_entry_t *epte = map_domain_page(mfn); @@ -407,7 +414,7 @@ e.emt = MTRR_NUM_TYPES; if ( recalc ) e.recalc = 1; - rc = atomic_write_ept_entry(&epte[i], e, level); + rc = atomic_write_ept_entry(&epte[i], e, parent_level - 1); ASSERT(rc == 0); changed = 1; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/p2m.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/p2m.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/p2m.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/p2m.c 2019-12-11 14:35:39.000000000 +0000 @@ -257,17 +257,22 @@ return 0; } +/* + * May be called with ot = nt = p2m_ram_rw for its side effect of + * recalculating all PTEs in the p2m. + */ void p2m_change_entry_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) { struct p2m_domain *p2m = p2m_get_hostp2m(d); - ASSERT(ot != nt); ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt)); p2m_lock(p2m); p2m->change_entry_type_global(p2m, ot, nt); - p2m->global_logdirty = (nt == p2m_ram_logdirty); + /* Don't allow 'recalculate' operations to change the logdirty state. 
*/ + if ( ot != nt ) + p2m->global_logdirty = (nt == p2m_ram_logdirty); p2m_unlock(p2m); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/shadow/common.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/shadow/common.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm/shadow/common.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm/shadow/common.c 2019-12-11 14:35:39.000000000 +0000 @@ -3506,7 +3506,8 @@ mode |= PG_SH_enable; - if ( d->arch.paging.shadow.total_pages == 0 ) + if ( d->arch.paging.shadow.total_pages < + sh_min_allocation(d) + d->arch.paging.shadow.p2m_pages ) { /* Init the shadow memory allocation if the user hasn't done so */ if ( shadow_set_allocation(d, 1, NULL) != 0 ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/mm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/mm.c 2019-12-11 14:35:39.000000000 +0000 @@ -295,9 +295,11 @@ * Initialise our DOMID_IO domain. * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. + * Quarantined PCI devices will be associated with this domain. */ dom_io = domain_create(DOMID_IO, NULL); BUG_ON(IS_ERR(dom_io)); + INIT_LIST_HEAD(&dom_io->arch.pdev_list); /* * Initialise our COW domain. @@ -610,21 +612,74 @@ static int _get_page_type(struct page_info *page, unsigned long type, bool preemptible); +/* + * The following flags are used to specify behavior of various get and + * put commands. The first is also stored in page->partial_flags to + * indicate the state of the page pointed to by + * page->pte[page->nr_validated_entries]. See the comment in mm.h for + * more information. + */ +#define PTF_partial_set (1 << 0) +#define PTF_preemptible (1 << 2) +#define PTF_defer (1 << 3) +#define PTF_retain_ref_on_restart (1 << 4) + static int get_page_and_type_from_mfn( mfn_t mfn, unsigned long type, struct domain *d, - int partial, int preemptible) + unsigned int flags) { struct page_info *page = mfn_to_page(mfn); int rc; + bool preemptible = flags & PTF_preemptible, + partial_set = flags & PTF_partial_set, + retain_ref = flags & PTF_retain_ref_on_restart; - if ( likely(partial >= 0) && + if ( likely(!partial_set) && unlikely(!get_page_from_mfn(mfn, d)) ) return -EINVAL; rc = _get_page_type(page, type, preemptible); - if ( unlikely(rc) && partial >= 0 && - (!preemptible || page != current->arch.old_guest_table) ) + /* + * Retain the refcount if: + * - page is fully validated (rc == 0) + * - page is not validated (rc < 0) but: + * - We came in with a reference (partial_set) + * - page is partially validated (rc == -ERESTART), and the + * caller has asked the ref to be retained in that case + * - page is partially validated but there's been an error + * (page == current->arch.old_guest_table) + * + * The partial_set-on-error clause is worth an explanation. There + * are two scenarios where partial_set might be true coming in: + * - mfn has been partially promoted / demoted as type `type`; + * i.e. has PGT_partial set + * - mfn has been partially demoted as L(type+1) (i.e., a linear + * page; e.g. 
we're being called from get_page_from_l2e with + * type == PGT_l1_table, but the mfn is PGT_l2_table) + * + * If there's an error, in the first case, _get_page_type will + * either return -ERESTART, in which case we want to retain the + * ref (as the caller will consider it retained), or -EINVAL, in + * which case old_guest_table will be set; in both cases, we need + * to retain the ref. + * + * In the second case, if there's an error, _get_page_type() can + * *only* return -EINVAL, and *never* set old_guest_table. In + * that case we also want to retain the reference, to allow the + * page to continue to be torn down (i.e., PGT_partial cleared) + * safely. + * + * Also note that we shouldn't be able to leave with the reference + * count retained unless we succeeded, or the operation was + * preemptible. + */ + if ( likely(!rc) || partial_set ) + /* nothing */; + else if ( page == current->arch.old_guest_table || + (retain_ref && rc == -ERESTART) ) + ASSERT(preemptible); + else put_page(page); return rc; @@ -1104,13 +1159,13 @@ define_get_linear_pagetable(l2); static int get_page_from_l2e( - l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) + l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags) { unsigned long mfn = l2e_get_pfn(l2e); int rc; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1; + return pv_l1tf_check_l2e(d, l2e) ? -EINTR : 1; if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { @@ -1119,8 +1174,9 @@ return -EINVAL; } - rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, - partial, false); + ASSERT(!(flags & PTF_preemptible)); + + rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) rc = 0; @@ -1137,12 +1193,12 @@ define_get_linear_pagetable(l3); static int get_page_from_l3e( - l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags) { int rc; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1; + return pv_l1tf_check_l3e(d, l3e) ? -EINTR : 1; if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { @@ -1152,7 +1208,7 @@ } rc = get_page_and_type_from_mfn( - l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1); + l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); if ( unlikely(rc == -EINVAL) && !is_pv_32bit_domain(d) && get_l3_linear_pagetable(l3e, pfn, d) ) @@ -1170,12 +1226,12 @@ define_get_linear_pagetable(l4); static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags) { int rc; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1; + return pv_l1tf_check_l4e(d, l4e) ? 
-EINTR : 1; if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { @@ -1185,14 +1241,14 @@ } rc = get_page_and_type_from_mfn( - l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1); + l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) rc = 0; return rc; } -static int _put_page_type(struct page_info *page, bool preemptible, +static int _put_page_type(struct page_info *page, unsigned int flags, struct page_info *ptpg); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) @@ -1275,7 +1331,7 @@ * Note also that this automatically deals correctly with linear p.t.'s. */ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - int partial, bool defer) + unsigned int flags) { int rc = 0; @@ -1295,19 +1351,15 @@ struct page_info *pg = l2e_get_page(l2e); struct page_info *ptpg = mfn_to_page(_mfn(pfn)); - if ( unlikely(partial > 0) ) - { - ASSERT(!defer); - rc = _put_page_type(pg, true, ptpg); - } - else if ( defer ) + if ( flags & PTF_defer ) { current->arch.old_guest_ptpg = ptpg; current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; } else { - rc = _put_page_type(pg, true, ptpg); + rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); if ( likely(!rc) ) put_page(pg); } @@ -1317,7 +1369,7 @@ } static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - int partial, bool defer) + unsigned int flags) { struct page_info *pg; int rc; @@ -1330,6 +1382,7 @@ unsigned long mfn = l3e_get_pfn(l3e); int writeable = l3e_get_flags(l3e) & _PAGE_RW; + ASSERT(!(flags & PTF_partial_set)); ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); do { put_data_page(mfn_to_page(_mfn(mfn)), writeable); @@ -1340,20 +1393,16 @@ pg = l3e_get_page(l3e); - if ( unlikely(partial > 0) ) - { - ASSERT(!defer); - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); - } - - if ( defer ) + if ( flags & PTF_defer ) { + ASSERT(!(flags & PTF_partial_set)); current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; return 0; } - rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); if ( likely(!rc) ) put_page(pg); @@ -1361,7 +1410,7 @@ } static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - int partial, bool defer) + unsigned int flags) { int rc = 1; @@ -1370,20 +1419,17 @@ { struct page_info *pg = l4e_get_page(l4e); - if ( unlikely(partial > 0) ) - { - ASSERT(!defer); - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); - } - - if ( defer ) + if ( flags & PTF_defer ) { + ASSERT(!(flags & PTF_partial_set)); current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; return 0; } - rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + rc = _put_page_type(pg, flags | PTF_preemptible, + mfn_to_page(_mfn(pfn))); if ( likely(!rc) ) put_page(pg); } @@ -1404,7 +1450,7 @@ { if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) { - ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; + ret = pv_l1tf_check_l1e(d, pl1e[i]) ? 
-EINTR : 0; if ( ret ) goto out; } @@ -1483,44 +1529,74 @@ unsigned long pfn = mfn_x(page_to_mfn(page)); l2_pgentry_t *pl2e; unsigned int i; - int rc = 0, partial = page->partial_pte; + int rc = 0; + unsigned int partial_flags = page->partial_flags; pl2e = map_domain_page(_mfn(pfn)); + /* + * NB that alloc_l2_table will never set partial_pte on an l2; but + * free_l2_table might if a linear_pagetable entry is interrupted + * partway through de-validation. In that circumstance, + * get_page_from_l2e() will always return -EINVAL; and we must + * retain the type ref by doing the normal partial_flags tracking. + */ + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; - i++, partial = 0 ) + i++, partial_flags = 0 ) { if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) - { - page->nr_validated_ptes = i; - rc = -ERESTART; - break; - } - - if ( !is_guest_l2_slot(d, type, i) || - (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 ) + rc = -EINTR; + else if ( !is_guest_l2_slot(d, type, i) || + (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 ) continue; - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; - page->partial_pte = partial ?: 1; - } - else if ( rc == -EINTR && i ) + /* + * It shouldn't be possible for get_page_from_l2e to return + * -ERESTART, since we never call this with PTF_preemptible. + * (alloc_l1_table may return -EINTR on an L1TF-vulnerable + * entry.) + * + * NB that while on a "clean" promotion, we can never get + * PGT_partial. It is possible to arrange for an l2e to + * contain a partially-devalidated l2; but in that case, both + * of the following functions will fail anyway (the first + * because the page in question is not an l1; the second + * because the page is not fully validated). + */ + ASSERT(rc != -ERESTART); + + if ( rc == -EINTR && i ) { page->nr_validated_ptes = i; - page->partial_pte = 0; + page->partial_flags = partial_flags;; rc = -ERESTART; } else if ( rc < 0 && rc != -EINTR ) { gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); + ASSERT(current->arch.old_guest_table == NULL); if ( i ) { + /* + * alloc_l1_table() doesn't set old_guest_table; it does + * its own tear-down immediately on failure. If it + * did we'd need to check it and set partial_flags as we + * do in alloc_l[34]_table(). + * + * Note on the use of ASSERT: if it's non-null and + * hasn't been cleaned up yet, it should have + * PGT_partial set; and so the type will be cleaned up + * on domain destruction. Unfortunately, we would + * leak the general ref held by old_guest_table; but + * leaking a page is less bad than a host crash. 
+ */ + ASSERT(current->arch.old_guest_table == NULL); page->nr_validated_ptes = i; - page->partial_pte = 0; + page->partial_flags = partial_flags; current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } } if ( rc < 0 ) @@ -1542,7 +1618,9 @@ unsigned long pfn = mfn_x(page_to_mfn(page)); l3_pgentry_t *pl3e; unsigned int i; - int rc = 0, partial = page->partial_pte; + int rc = 0; + unsigned int partial_flags = page->partial_flags; + l3_pgentry_t l3e = l3e_empty(); pl3e = map_domain_page(_mfn(pfn)); @@ -1557,16 +1635,11 @@ memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; - i++, partial = 0 ) + i++, partial_flags = 0 ) { if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) - { - page->nr_validated_ptes = i; - rc = -ERESTART; - break; - } - - if ( is_pv_32bit_domain(d) && (i == 3) ) + rc = -EINTR; + else if ( is_pv_32bit_domain(d) && (i == 3) ) { if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) @@ -1574,24 +1647,31 @@ else rc = get_page_and_type_from_mfn( l3e_get_mfn(pl3e[i]), - PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); + PGT_l2_page_table | PGT_pae_xen_l2, d, + partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); } - else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 ) + else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, + partial_flags | PTF_retain_ref_on_restart)) > 0 ) continue; if ( rc == -ERESTART ) { page->nr_validated_ptes = i; - page->partial_pte = partial ?: 1; + /* Set 'set', leave 'general ref' set if this entry was set */ + page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i ) { page->nr_validated_ptes = i; - page->partial_pte = 0; + page->partial_flags = partial_flags; rc = -ERESTART; } if ( rc < 0 ) + { + /* XSA-299 Backport: Copy l3e for checking */ + l3e = pl3e[i]; break; + } pl3e[i] = adjust_guest_l3e(pl3e[i], d); } @@ -1604,9 +1684,31 @@ if ( i ) { page->nr_validated_ptes = i; - page->partial_pte = 0; + page->partial_flags = partial_flags; + if ( current->arch.old_guest_table ) + { + /* + * We've experienced a validation failure. If + * old_guest_table is set, "transfer" the general + * reference count to pl3e[nr_validated_ptes] by + * setting PTF_partial_set. + * + * As a precaution, check that old_guest_table is the + * page pointed to by pl3e[nr_validated_ptes]. If + * not, it's safer to leak a type ref on production + * builds. 
+ */ + if ( current->arch.old_guest_table == l3e_get_page(l3e) ) + { + ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; + } + else + ASSERT_UNREACHABLE(); + } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } while ( i-- > 0 ) pl3e[i] = unadjust_guest_l3e(pl3e[i], d); @@ -1736,19 +1838,22 @@ unsigned long pfn = mfn_x(page_to_mfn(page)); l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); unsigned int i; - int rc = 0, partial = page->partial_pte; + int rc = 0; + unsigned int partial_flags = page->partial_flags; for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; - i++, partial = 0 ) + i++, partial_flags = 0 ) { if ( !is_guest_l4_slot(d, i) || - (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 ) + (rc = get_page_from_l4e(pl4e[i], pfn, d, + partial_flags | PTF_retain_ref_on_restart)) > 0 ) continue; if ( rc == -ERESTART ) { page->nr_validated_ptes = i; - page->partial_pte = partial ?: 1; + /* Set 'set', leave 'general ref' set if this entry was set */ + page->partial_flags = PTF_partial_set; } else if ( rc < 0 ) { @@ -1758,15 +1863,35 @@ if ( i ) { page->nr_validated_ptes = i; - page->partial_pte = 0; + page->partial_flags = partial_flags; if ( rc == -EINTR ) rc = -ERESTART; else { if ( current->arch.old_guest_table ) - page->nr_validated_ptes++; + { + /* + * We've experienced a validation failure. If + * old_guest_table is set, "transfer" the general + * reference count to pl3e[nr_validated_ptes] by + * setting PTF_partial_set. + * + * As a precaution, check that old_guest_table is the + * page pointed to by pl4e[nr_validated_ptes]. If + * not, it's safer to leak a type ref on production + * builds. + */ + if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) ) + { + ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; + } + else + ASSERT_UNREACHABLE(); + } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } } } @@ -1811,19 +1936,20 @@ struct domain *d = page_get_owner(page); unsigned long pfn = mfn_x(page_to_mfn(page)); l2_pgentry_t *pl2e; - int rc = 0, partial = page->partial_pte; - unsigned int i = page->nr_validated_ptes - !partial; + int rc = 0; + unsigned int partial_flags = page->partial_flags, + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); pl2e = map_domain_page(_mfn(pfn)); for ( ; ; ) { if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) - rc = put_page_from_l2e(pl2e[i], pfn, partial, false); + rc = put_page_from_l2e(pl2e[i], pfn, partial_flags); if ( rc < 0 ) break; - partial = 0; + partial_flags = 0; if ( !i-- ) break; @@ -1845,12 +1971,12 @@ else if ( rc == -ERESTART ) { page->nr_validated_ptes = i; - page->partial_pte = partial ?: -1; + page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) { - page->nr_validated_ptes = i + 1; - page->partial_pte = 0; + page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); + page->partial_flags = partial_flags; rc = -ERESTART; } @@ -1862,18 +1988,19 @@ struct domain *d = page_get_owner(page); unsigned long pfn = mfn_x(page_to_mfn(page)); l3_pgentry_t *pl3e; - int rc = 0, partial = page->partial_pte; - unsigned int i = page->nr_validated_ptes - !partial; + int rc = 0; + unsigned int partial_flags = page->partial_flags, + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); pl3e = map_domain_page(_mfn(pfn)); for ( ; ; ) { 
- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); + rc = put_page_from_l3e(pl3e[i], pfn, partial_flags); if ( rc < 0 ) break; - partial = 0; + partial_flags = 0; if ( rc == 0 ) pl3e[i] = unadjust_guest_l3e(pl3e[i], d); @@ -1892,12 +2019,12 @@ if ( rc == -ERESTART ) { page->nr_validated_ptes = i; - page->partial_pte = partial ?: -1; + page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) { - page->nr_validated_ptes = i + 1; - page->partial_pte = 0; + page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); + page->partial_flags = partial_flags; rc = -ERESTART; } return rc > 0 ? 0 : rc; @@ -1908,26 +2035,27 @@ struct domain *d = page_get_owner(page); unsigned long pfn = mfn_x(page_to_mfn(page)); l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); - int rc = 0, partial = page->partial_pte; - unsigned int i = page->nr_validated_ptes - !partial; + int rc = 0; + unsigned partial_flags = page->partial_flags, + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); do { if ( is_guest_l4_slot(d, i) ) - rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); + rc = put_page_from_l4e(pl4e[i], pfn, partial_flags); if ( rc < 0 ) break; - partial = 0; + partial_flags = 0; } while ( i-- ); if ( rc == -ERESTART ) { page->nr_validated_ptes = i; - page->partial_pte = partial ?: -1; + page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) { - page->nr_validated_ptes = i + 1; - page->partial_pte = 0; + page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); + page->partial_flags = partial_flags; rc = -ERESTART; } @@ -2203,7 +2331,7 @@ return -EBUSY; } - put_page_from_l2e(ol2e, pfn, 0, true); + put_page_from_l2e(ol2e, pfn, PTF_defer); return rc; } @@ -2271,7 +2399,7 @@ if ( !create_pae_xen_mappings(d, pl3e) ) BUG(); - put_page_from_l3e(ol3e, pfn, 0, 1); + put_page_from_l3e(ol3e, pfn, PTF_defer); return rc; } @@ -2334,7 +2462,7 @@ return -EFAULT; } - put_page_from_l4e(ol4e, pfn, 0, 1); + put_page_from_l4e(ol4e, pfn, PTF_defer); return rc; } @@ -2598,7 +2726,7 @@ if ( !(type & PGT_partial) ) { page->nr_validated_ptes = 1U << PAGETABLE_ORDER; - page->partial_pte = 0; + page->partial_flags = 0; } switch ( type & PGT_type_mask ) @@ -2635,14 +2763,17 @@ { int rc = free_page_type(page, type, preemptible); + if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) && + (type & PGT_validated) && rc != -EINTR ) + { + /* Any time we begin de-validation of a page, adjust linear counts */ + dec_linear_uses(page); + dec_linear_entries(ptpg); + } + /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { - if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) ) - { - dec_linear_uses(page); - dec_linear_entries(ptpg); - } ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying); set_tlbflush_timestamp(page); smp_wmb(); @@ -2667,10 +2798,11 @@ } -static int _put_page_type(struct page_info *page, bool preemptible, +static int _put_page_type(struct page_info *page, unsigned int flags, struct page_info *ptpg) { unsigned long nx, x, y = page->u.inuse.type_info; + bool preemptible = flags & PTF_preemptible; ASSERT(current_locked_page_ne_check(page)); @@ -2679,6 +2811,28 @@ x = y; nx = x - 1; + /* + * Is this expected to do a full reference drop, or only + * cleanup partial validation / devalidation? + * + * If the former, the caller must hold a "full" type ref; + * which means the page must be validated. 
If the page is + * *not* fully validated, continuing would almost certainly + * open up a security hole. An exception to this is during + * domain destruction, where PGT_validated can be dropped + * without dropping a type ref. + * + * If the latter, do nothing unless type PGT_partial is set. + * If it is set, the type count must be 1. + */ + if ( !(flags & PTF_partial_set) ) + BUG_ON((x & PGT_partial) || + !((x & PGT_validated) || page_get_owner(page)->is_dying)); + else if ( !(x & PGT_partial) ) + return 0; + else + BUG_ON((x & PGT_count_mask) != 1); + ASSERT((x & PGT_count_mask) != 0); switch ( nx & (PGT_locked | PGT_count_mask) ) @@ -2877,7 +3031,7 @@ if ( unlikely(iommu_ret) ) { - _put_page_type(page, false, NULL); + _put_page_type(page, 0, NULL); rc = iommu_ret; goto out; } @@ -2889,9 +3043,9 @@ if ( !(x & PGT_partial) ) { page->nr_validated_ptes = 0; - page->partial_pte = 0; + page->partial_flags = 0; + page->linear_pt_count = 0; } - page->linear_pt_count = 0; rc = alloc_page_type(page, type, preemptible); } @@ -2904,7 +3058,7 @@ void put_page_type(struct page_info *page) { - int rc = _put_page_type(page, false, NULL); + int rc = _put_page_type(page, 0, NULL); ASSERT(rc == 0); (void)rc; } @@ -2921,7 +3075,7 @@ int put_page_type_preemptible(struct page_info *page) { - return _put_page_type(page, true, NULL); + return _put_page_type(page, PTF_preemptible, NULL); } int get_page_type_preemptible(struct page_info *page, unsigned long type) @@ -2938,17 +3092,34 @@ if ( !v->arch.old_guest_table ) return 0; - switch ( rc = _put_page_type(v->arch.old_guest_table, true, - v->arch.old_guest_ptpg) ) + rc = _put_page_type(v->arch.old_guest_table, + PTF_preemptible | + ( v->arch.old_guest_table_partial ? + PTF_partial_set : 0 ), + v->arch.old_guest_ptpg); + + if ( rc == -ERESTART || rc == -EINTR ) { - case -EINTR: - case -ERESTART: + v->arch.old_guest_table_partial = (rc == -ERESTART); return -ERESTART; - case 0: - put_page(v->arch.old_guest_table); } + /* + * It shouldn't be possible for _put_page_type() to return + * anything else at the moment; but if it does happen in + * production, leaking the type ref is probably the best thing to + * do. Either way, drop the general ref held by old_guest_table. + */ + ASSERT(rc == 0); + + put_page(v->arch.old_guest_table); v->arch.old_guest_table = NULL; + v->arch.old_guest_ptpg = NULL; + /* + * Safest default if someone sets old_guest_table without + * explicitly setting old_guest_table_partial. + */ + v->arch.old_guest_table_partial = true; return rc; } @@ -2956,40 +3127,36 @@ int vcpu_destroy_pagetables(struct vcpu *v) { unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); - struct page_info *page; - l4_pgentry_t *l4tab = NULL; + struct page_info *page = NULL; int rc = put_old_guest_table(v); + bool put_guest_table_user = false; if ( rc ) return rc; - if ( is_pv_32bit_vcpu(v) ) - { - l4tab = map_domain_page(_mfn(mfn)); - mfn = l4e_get_pfn(*l4tab); - } + v->arch.cr3 = 0; - if ( mfn ) + /* + * Get the top-level guest page; either the guest_table itself, for + * 64-bit, or the top-level l4 entry for 32-bit. Either way, remove + * the reference to that page. 
+ */ + if ( is_pv_32bit_vcpu(v) ) { - page = mfn_to_page(_mfn(mfn)); - if ( paging_mode_refcounts(v->domain) ) - put_page(page); - else - rc = put_page_and_type_preemptible(page); - } + l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn)); - if ( l4tab ) - { - if ( !rc ) - l4e_write(l4tab, l4e_empty()); + mfn = l4e_get_pfn(*l4tab); + l4e_write(l4tab, l4e_empty()); unmap_domain_page(l4tab); } - else if ( !rc ) + else { v->arch.guest_table = pagetable_null(); + put_guest_table_user = true; + } - /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ - mfn = pagetable_get_pfn(v->arch.guest_table_user); + /* Free that page if non-zero */ + do { if ( mfn ) { page = mfn_to_page(_mfn(mfn)); @@ -2997,18 +3164,41 @@ put_page(page); else rc = put_page_and_type_preemptible(page); + mfn = 0; } - if ( !rc ) - v->arch.guest_table_user = pagetable_null(); - } - v->arch.cr3 = 0; + if ( !rc && put_guest_table_user ) + { + /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ + mfn = pagetable_get_pfn(v->arch.guest_table_user); + v->arch.guest_table_user = pagetable_null(); + put_guest_table_user = false; + } + } while ( mfn ); /* - * put_page_and_type_preemptible() is liable to return -EINTR. The - * callers of us expect -ERESTART so convert it over. + * If a "put" operation was interrupted, finish things off in + * put_old_guest_table() when the operation is restarted. */ - return rc != -EINTR ? rc : -ERESTART; + switch ( rc ) + { + case -EINTR: + case -ERESTART: + v->arch.old_guest_ptpg = NULL; + v->arch.old_guest_table = page; + v->arch.old_guest_table_partial = (rc == -ERESTART); + rc = -ERESTART; + break; + default: + /* + * Failure to 'put' a page may cause it to leak, but that's + * less bad than a crash. + */ + ASSERT(rc == 0); + break; + } + + return rc; } int new_guest_cr3(mfn_t mfn) @@ -3064,7 +3254,7 @@ return 0; } - rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1); + rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible); switch ( rc ) { case 0: @@ -3098,11 +3288,11 @@ switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: - rc = -ERESTART; - /* fallthrough */ case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = -ERESTART; break; default: BUG_ON(rc); @@ -3376,6 +3566,7 @@ { curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + curr->arch.old_guest_table_partial = false; } } } @@ -3410,6 +3601,11 @@ case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + /* + * EINTR means we still hold the type ref; ERESTART + * means PGT_partial holds the type ref + */ + curr->arch.old_guest_table_partial = (rc == -ERESTART); rc = 0; break; default: @@ -3452,7 +3648,7 @@ if ( op.arg1.mfn != 0 ) { rc = get_page_and_type_from_mfn( - _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1); + _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); if ( unlikely(rc) ) { @@ -3478,11 +3674,15 @@ switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: - rc = -ERESTART; - /* fallthrough */ case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + /* + * EINTR means we still hold the type ref; + * ERESTART means PGT_partial holds the ref + */ + curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = -ERESTART; break; default: BUG_ON(rc); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/monitor.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/monitor.c --- 
xen-4.11.1+92-g6c33308a8d/xen/arch/x86/monitor.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/monitor.c 2019-12-11 14:35:39.000000000 +0000 @@ -288,9 +288,6 @@ ad->monitor.debug_exception_sync = requested_status ? mop->u.debug_exception.sync : 0; - - hvm_set_icebp_interception(d, requested_status); - domain_unpause(d); break; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/msi.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/msi.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/msi.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/msi.c 2019-12-11 14:35:39.000000000 +0000 @@ -1268,6 +1268,31 @@ msi_free_irqs(pdev); } +int pci_reset_msix_state(struct pci_dev *pdev) +{ + uint8_t slot = PCI_SLOT(pdev->devfn); + uint8_t func = PCI_FUNC(pdev->devfn); + unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus, slot, func, + PCI_CAP_ID_MSIX); + + ASSERT(pos); + /* + * Xen expects the device state to be the after reset one, and hence + * host_maskall = guest_maskall = false and all entries should have the + * mask bit set. Test that the maskall bit is not set, having it set could + * signal that the device hasn't been reset properly. + */ + if ( pci_conf_read16(pdev->seg, pdev->bus, slot, func, + msix_control_reg(pos)) & + PCI_MSIX_FLAGS_MASKALL ) + return -EBUSY; + + pdev->msix->host_maskall = false; + pdev->msix->guest_maskall = false; + + return 0; +} + int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg, unsigned int size, uint32_t *data) { @@ -1304,6 +1329,7 @@ { uint16_t cntl; uint32_t unused; + unsigned int nvec = entry->msi.nvec; pos = entry->msi_attrib.pos; if ( reg < pos || reg >= entry->msi.mpos + 8 ) @@ -1316,7 +1342,7 @@ cntl = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos)); unused = ~(uint32_t)0 >> (32 - multi_msi_capable(cntl)); - for ( pos = 0; pos < entry->msi.nvec; ++pos, ++entry ) + for ( pos = 0; pos < nvec; ++pos, ++entry ) { entry->msi_attrib.guest_masked = *data >> entry->msi_attrib.entry_nr; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/msr.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/msr.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/msr.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/msr.c 2019-12-11 14:35:39.000000000 +0000 @@ -153,9 +153,32 @@ case MSR_FLUSH_CMD: /* Write-only */ case MSR_TSX_FORCE_ABORT: + case MSR_TSX_CTRL: /* Not offered to guests. */ goto gp_fault; + case MSR_AMD_PATCHLEVEL: + BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); + /* + * AMD and Intel use the same MSR for the current microcode version. + * + * There is no need to jump through the SDM-provided hoops for Intel. + * A guest might itself perform the "write 0, CPUID, read" sequence, + * but servicing the CPUID for the guest typically wont result in + * actually executing a CPUID instruction. + * + * As a guest can't influence the value of this MSR, the value will be + * from Xen's last microcode load, which can be forwarded straight to + * the guest. + */ + if ( (cp->x86_vendor != X86_VENDOR_INTEL && + cp->x86_vendor != X86_VENDOR_AMD) || + (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) || + rdmsr_safe(MSR_AMD_PATCHLEVEL, *val) ) + goto gp_fault; + break; + case MSR_SPEC_CTRL: if ( !cp->feat.ibrsb ) goto gp_fault; @@ -211,9 +234,23 @@ case MSR_ARCH_CAPABILITIES: /* Read-only */ case MSR_TSX_FORCE_ABORT: + case MSR_TSX_CTRL: /* Not offered to guests. 
*/ goto gp_fault; + case MSR_AMD_PATCHLEVEL: + BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); + /* + * AMD and Intel use the same MSR for the current microcode version. + * + * Both document it as read-only. However Intel also document that, + * for backwards compatibility, the OS should write 0 to it before + * trying to access the current microcode version. + */ + if ( d->arch.cpuid->x86_vendor != X86_VENDOR_INTEL || val != 0 ) + goto gp_fault; + break; + case MSR_AMD_PATCHLOADER: /* * See note on MSR_IA32_UCODE_WRITE below, which may or may not apply diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/psr.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/psr.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/psr.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/psr.c 2019-12-11 14:35:39.000000000 +0000 @@ -1269,6 +1269,17 @@ cos_num = props->cos_num; ASSERT(info->array_len >= index + cos_num); + /* + * Multiple RDT features may co-exist and their COS_MAX may be + * different. So we should prevent one feature to write COS + * register which exceeds its COS_MAX. + */ + if ( cos > feat->cos_max ) + { + index += cos_num; + continue; + } + for ( j = 0; j < cos_num; j++, index++ ) { if ( feat->cos_reg_val[cos * cos_num + j] != info->val[index] ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emul-gate-op.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emul-gate-op.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emul-gate-op.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emul-gate-op.c 2019-12-11 14:35:39.000000000 +0000 @@ -51,7 +51,13 @@ const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel); if ( (gate_sel < 4) || - ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || + /* + * We're interested in call gates only, which occupy a single + * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit. + */ + ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >= + (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents + : v->arch.pv_vcpu.gdt_ents)) || __get_user(desc, pdesc) ) return 0; @@ -70,7 +76,7 @@ if ( !is_pv_32bit_vcpu(v) ) { if ( (*ar & 0x1f00) != 0x0c00 || - (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || + /* Limit check done above already. */ __get_user(desc, pdesc + 1) || (desc.b & 0x1f00) ) return 0; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emul-priv-op.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emul-priv-op.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emul-priv-op.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emul-priv-op.c 2019-12-11 14:35:39.000000000 +0000 @@ -199,7 +199,7 @@ /* AMD extended configuration space access?
*/ if ( CF8_ADDR_HI(currd->arch.pci_cf8) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) + boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 < 0x17 ) { uint64_t msr_val; @@ -912,16 +912,16 @@ *val = 0; return X86EMUL_OKAY; - case MSR_IA32_UCODE_REV: - BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - { - if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) - break; - /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_eax(1); - } - goto normal; + case MSR_FAM10H_MMIO_CONF_BASE: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 >= 0x17 ) + break; + /* fall through */ + case MSR_AMD64_NB_CFG: + if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) + goto normal; + *val = 0; + return X86EMUL_OKAY; case MSR_IA32_MISC_ENABLE: rdmsrl(reg, *val); @@ -1048,9 +1048,6 @@ break; case MSR_AMD64_NB_CFG: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || - boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) - break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) return X86EMUL_OKAY; if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) || @@ -1062,7 +1059,7 @@ case MSR_FAM10H_MMIO_CONF_BASE: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || - boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 >= 0x17 ) break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) return X86EMUL_OKAY; @@ -1081,17 +1078,6 @@ return X86EMUL_OKAY; break; - case MSR_IA32_UCODE_REV: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) - break; - if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) - return X86EMUL_OKAY; - if ( rdmsr_safe(reg, temp) ) - break; - if ( val ) - goto invalid; - return X86EMUL_OKAY; - case MSR_IA32_MISC_ENABLE: rdmsrl(reg, temp); if ( val != guest_misc_enable(temp) ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emulate.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emulate.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/emulate.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/emulate.c 2019-12-11 14:35:39.000000000 +0000 @@ -31,7 +31,14 @@ { struct desc_struct desc; - if ( sel < 4) + if ( sel < 4 || + /* + * Don't apply the GDT limit here, as the selector may be a Xen + * provided one. __get_user() will fail (without taking further + * action) for ones falling in the gap between guest populated + * and Xen ones. + */ + ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) ) desc.b = desc.a = 0; else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) ) return 0; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/mm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/mm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/mm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/mm.c 2019-12-11 14:35:39.000000000 +0000 @@ -92,12 +92,16 @@ BUG_ON(unlikely(in_irq())); /* - * Hardware limit checking should guarantee this property. NB. This is + * Prior limit checking should guarantee this property. NB. This is * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the * current vcpu, and vcpu_reset() will block until this vcpu has been * descheduled before continuing. 
*/ - ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents); + if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) ) + { + ASSERT_UNREACHABLE(); + return false; + } if ( is_pv_32bit_domain(currd) ) linear = (uint32_t)linear; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/shim.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/shim.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/pv/shim.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/pv/shim.c 2019-12-11 14:35:39.000000000 +0000 @@ -470,6 +470,9 @@ else rc = xen_hypercall_event_channel_op(EVTCHNOP_status, &status); + if ( !rc && __copy_to_guest(arg, &status, 1) ) + rc = -EFAULT; + break; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/setup.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/setup.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/setup.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/setup.c 2019-12-11 14:35:39.000000000 +0000 @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -100,8 +99,6 @@ unsigned long __read_mostly xen_virt_end; -DEFINE_PER_CPU(struct tss_struct, init_tss); - char __section(".bss.stack_aligned") __aligned(STACK_SIZE) cpu0_stack[STACK_SIZE]; @@ -671,7 +668,7 @@ unsigned int initrdidx, num_parked = 0; multiboot_info_t *mbi; module_t *mod; - unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; + unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1]; int i, j, e820_warn = 0, bytes = 0; bool acpi_boot_table_init_done = false, relocated = false; struct ns16550_defaults ns16550 = { @@ -821,6 +818,17 @@ if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) ) panic("dom0 kernel not specified. Check bootloader configuration."); + /* Check that we don't have a silly number of modules. */ + if ( mbi->mods_count > sizeof(module_map) * 8 ) + { + mbi->mods_count = sizeof(module_map) * 8; + printk("Excessive multiboot modules - using the first %u only\n", + mbi->mods_count); + } + + bitmap_fill(module_map, mbi->mods_count); + __clear_bit(0, module_map); /* Dom0 kernel is always first */ + if ( pvh_boot ) { /* pvh_init() already filled in e820_raw */ @@ -1535,10 +1543,6 @@ init_IRQ(); - module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count)); - bitmap_fill(module_map, mbi->mods_count); - __clear_bit(0, module_map); /* Dom0 kernel is always first */ - xsm_multiboot_init(module_map, mbi); microcode_grab_module(module_map, mbi); @@ -1547,6 +1551,8 @@ early_microcode_init(); + tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ + identify_cpu(&boot_cpu_data); set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/smp.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/smp.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/smp.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/smp.c 2019-12-11 14:35:39.000000000 +0000 @@ -302,23 +302,31 @@ */ void smp_send_stop(void) { - int timeout = 10; + unsigned int cpu = smp_processor_id(); - local_irq_disable(); - fixup_irqs(cpumask_of(smp_processor_id()), 0); - local_irq_enable(); - - smp_call_function(stop_this_cpu, NULL, 0); - - /* Wait 10ms for all other CPUs to go offline. 
*/ - while ( (num_online_cpus() > 1) && (timeout-- > 0) ) - mdelay(1); - - local_irq_disable(); - disable_IO_APIC(); - hpet_disable(); - __stop_this_cpu(); - local_irq_enable(); + if ( num_online_cpus() > 1 ) + { + int timeout = 10; + + local_irq_disable(); + fixup_irqs(cpumask_of(cpu), 0); + local_irq_enable(); + + smp_call_function(stop_this_cpu, NULL, 0); + + /* Wait 10ms for all other CPUs to go offline. */ + while ( (num_online_cpus() > 1) && (timeout-- > 0) ) + mdelay(1); + } + + if ( cpu_online(cpu) ) + { + local_irq_disable(); + disable_IO_APIC(); + hpet_disable(); + __stop_this_cpu(); + local_irq_enable(); + } } void smp_send_nmi_allbutself(void) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/smpboot.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/smpboot.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/smpboot.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/smpboot.c 2019-12-11 14:35:39.000000000 +0000 @@ -376,6 +376,8 @@ if ( boot_cpu_has(X86_FEATURE_IBRSB) ) wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); + tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ + if ( xen_guest ) hypervisor_ap_setup(); @@ -829,7 +831,13 @@ if ( !rc ) rc = clone_mapping(idt_tables[cpu], rpt); if ( !rc ) - rc = clone_mapping(&per_cpu(init_tss, cpu), rpt); + { + struct tss_page *ptr = &per_cpu(tss_page, cpu); + + BUILD_BUG_ON(sizeof(*ptr) != PAGE_SIZE); + + rc = clone_mapping(&ptr->tss, rpt); + } if ( !rc ) rc = clone_mapping((void *)per_cpu(stubs.addr, cpu), rpt); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/spec_ctrl.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/spec_ctrl.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/spec_ctrl.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/spec_ctrl.c 2019-12-11 14:35:39.000000000 +0000 @@ -152,6 +152,9 @@ if ( opt_pv_l1tf_domu < 0 ) opt_pv_l1tf_domu = 0; + if ( opt_tsx == -1 ) + opt_tsx = -3; + disable_common: opt_rsb_pv = false; opt_rsb_hvm = false; @@ -362,7 +365,7 @@ printk("Speculative mitigation facilities:\n"); /* Hardware features which pertain to speculative mitigations. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", @@ -374,7 +377,9 @@ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", - (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : ""); + (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", + (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); /* Compiled-in support which pertains to mitigations. */ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) @@ -388,7 +393,7 @@ "\n"); /* Settings for Xen's protection, irrespective of guests. */ - printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n", + printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n", thunk == THUNK_NONE ? "N/A" : thunk == THUNK_RETPOLINE ? "RETPOLINE" : thunk == THUNK_LFENCE ? "LFENCE" : @@ -397,6 +402,8 @@ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", !boot_cpu_has(X86_FEATURE_SSBD) ? "" : (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", + !(caps & ARCH_CAPS_TSX_CTRL) ? "" : + (opt_tsx & 1) ? " TSX+" : " TSX-", opt_ibpb ? " IBPB" : "", opt_l1d_flush ? 
" L1D_FLUSH" : "", opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : ""); @@ -415,6 +422,7 @@ printk(" Support for VMs: PV:%s%s%s%s%s, HVM:%s%s%s%s%s\n", (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || boot_cpu_has(X86_FEATURE_SC_RSB_PV) || + boot_cpu_has(X86_FEATURE_MD_CLEAR) || opt_eager_fpu) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", @@ -422,6 +430,7 @@ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || + boot_cpu_has(X86_FEATURE_MD_CLEAR) || opt_eager_fpu) ? "" : " None", boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", @@ -567,9 +576,11 @@ case 0x4d: /* Avaton / Rangely (Silvermont) */ case 0x4c: /* Cherrytrail / Brasswell */ case 0x4a: /* Merrifield */ + case 0x57: /* Knights Landing */ case 0x5a: /* Moorefield */ case 0x5c: /* Goldmont */ case 0x5f: /* Denverton */ + case 0x85: /* Knights Mill */ return true; default: @@ -907,6 +918,7 @@ { enum ind_thunk thunk = THUNK_DEFAULT; bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled; + bool cpu_has_bug_taa; uint64_t caps = 0; if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) @@ -1136,6 +1148,53 @@ "enabled. Mitigations will not be fully effective. Please\n" "choose an explicit smt= setting. See XSA-297.\n"); + /* + * Vulnerability to TAA is a little complicated to quantify. + * + * In the pipeline, it is just another way to get speculative access to + * stale load port, store buffer or fill buffer data, and therefore can be + * considered a superset of MDS (on TSX-capable parts). On parts which + * predate MDS_NO, the existing VERW flushing will mitigate this + * sidechannel as well. + * + * On parts which contain MDS_NO, the lack of VERW flushing means that an + * attacker can still use TSX to target microarchitectural buffers to leak + * secrets. Therefore, we consider TAA to be the set of TSX-capable parts + * which have MDS_NO but lack TAA_NO. + * + * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the + * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so + * we check both to spot TSX in a microcode/cmdline independent way. + */ + cpu_has_bug_taa = + (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) && + (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO; + + /* + * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs + * the MDS mitigation of disabling HT and using VERW flushing. + * + * On CPUs which advertise MDS_NO, VERW has no flushing side effect until + * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being + * advertised, and there isn't a MD_CLEAR_2 flag to use... + * + * If we're on affected hardware, able to do something about it (which + * implies that VERW now works), no explicit TSX choice and traditional + * MDS mitigations (no-SMT, VERW) not obviosuly in use (someone might + * plausibly value TSX higher than Hyperthreading...), disable TSX to + * mitigate TAA. 
+ */ + if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) && + ((hw_smt_enabled && opt_smt) || + !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) ) + { + setup_clear_cpu_cap(X86_FEATURE_HLE); + setup_clear_cpu_cap(X86_FEATURE_RTM); + + opt_tsx = 0; + tsx_init(); + } + print_details(thunk, caps); /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/time.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/time.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/time.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/time.c 2019-12-11 14:35:39.000000000 +0000 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -364,12 +365,41 @@ return hpet_read32(HPET_COUNTER); } -static s64 __init init_hpet(struct platform_timesource *pts) +static int64_t __init init_hpet(struct platform_timesource *pts) { - u64 hpet_rate = hpet_setup(), start; - u32 count, target; + uint64_t hpet_rate, start; + uint32_t count, target; - if ( hpet_rate == 0 ) + if ( hpet_address && strcmp(opt_clocksource, pts->id) && + cpuidle_using_deep_cstate() ) + { + if ( pci_conf_read16(0, 0, 0x1f, 0, + PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL ) + switch ( pci_conf_read16(0, 0, 0x1f, 0, PCI_DEVICE_ID) ) + { + /* HPET on Bay Trail platforms will halt in deep C states. */ + case 0x0f1c: + /* HPET on Cherry Trail platforms will halt in deep C states. */ + case 0x229c: + hpet_address = 0; + break; + } + + /* + * Some Coffee Lake platforms have a skewed HPET timer once the SoCs + * entered PC10. + */ + if ( pci_conf_read16(0, 0, 0, 0, + PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL && + pci_conf_read16(0, 0, 0, 0, + PCI_DEVICE_ID) == 0x3ec4 ) + hpet_address = 0; + + if ( !hpet_address ) + printk("Disabling HPET for being unreliable\n"); + } + + if ( (hpet_rate = hpet_setup()) == 0 ) return 0; pts->frequency = hpet_rate; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/traps.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/traps.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/traps.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/traps.c 2019-12-11 14:35:39.000000000 +0000 @@ -106,6 +106,12 @@ /* Pointer to the IDT of every CPU. */ idt_entry_t *idt_tables[NR_CPUS] __read_mostly; +/* + * The TSS is smaller than a page, but we give it a full page to avoid + * adjacent per-cpu data leaking via Meltdown when XPTI is in use. 
+ */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_page, tss_page); + bool (*ioemul_handle_quirk)( u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); @@ -557,7 +563,7 @@ printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n", (void *)esp_top, (void *)esp_bottom, (void *)esp, - (void *)per_cpu(init_tss, cpu).rsp0); + (void *)per_cpu(tss_page, cpu).tss.rsp0); /* * Trigger overflow trace if %esp is anywhere within the guard page, or @@ -1917,7 +1923,7 @@ void load_TR(void) { - struct tss_struct *tss = &this_cpu(init_tss); + struct tss64 *tss = &this_cpu(tss_page).tss; struct desc_ptr old_gdt, tss_gdt = { .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), .limit = LAST_RESERVED_GDT_BYTE @@ -1925,14 +1931,10 @@ _set_tssldt_desc( this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, - (unsigned long)tss, - offsetof(struct tss_struct, __cacheline_filler) - 1, - SYS_DESC_tss_avail); + (unsigned long)tss, sizeof(*tss) - 1, SYS_DESC_tss_avail); _set_tssldt_desc( this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, - (unsigned long)tss, - offsetof(struct tss_struct, __cacheline_filler) - 1, - SYS_DESC_tss_busy); + (unsigned long)tss, sizeof(*tss) - 1, SYS_DESC_tss_busy); /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */ asm volatile ( diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/tsx.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/tsx.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/tsx.c 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/tsx.c 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,75 @@ +#include +#include + +/* + * Valid values: + * 1 => Explicit tsx=1 + * 0 => Explicit tsx=0 + * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA + * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) + * + * This is arranged such that the bottom bit encodes whether TSX is actually + * disabled, while identifying various explicit (>=0) and implicit (<0) + * conditions. + */ +int8_t __read_mostly opt_tsx = -1; +int8_t __read_mostly cpu_has_tsx_ctrl = -1; + +static int __init parse_tsx(const char *s) +{ + int rc = 0, val = parse_bool(s, NULL); + + if ( val >= 0 ) + opt_tsx = val; + else + rc = -EINVAL; + + return rc; +} +custom_param("tsx", parse_tsx); + +void tsx_init(void) +{ + /* + * This function is first called between microcode being loaded, and CPUID + * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL + * is available. + */ + if ( unlikely(cpu_has_tsx_ctrl < 0) ) + { + uint64_t caps = 0; + + if ( boot_cpu_data.cpuid_level >= 7 && + (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) ) + rdmsrl(MSR_ARCH_CAPABILITIES, caps); + + cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL); + } + + if ( cpu_has_tsx_ctrl ) + { + uint64_t val; + + rdmsrl(MSR_TSX_CTRL, val); + + val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR); + /* Check bottom bit only. Higher bits are various sentinels.
*/ + if ( !(opt_tsx & 1) ) + val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_TSX_CTRL, val); + } + else if ( opt_tsx >= 0 ) + printk_once(XENLOG_WARNING + "MSR_TSX_CTRL not available - Ignoring tsx= setting\n"); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/x86_64/mm.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/x86_64/mm.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/x86_64/mm.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/x86_64/mm.c 2019-12-11 14:35:39.000000000 +0000 @@ -574,8 +574,9 @@ page_to_mfn(l1_pg), 1UL << (2 * PAGETABLE_ORDER), PAGE_HYPERVISOR); + /* Fill with INVALID_M2P_ENTRY. */ memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), - 0x77, 1UL << L3_PAGETABLE_SHIFT); + 0xFF, 1UL << L3_PAGETABLE_SHIFT); ASSERT(!l2_table_offset(va)); /* NB. Cannot be GLOBAL: guest user mode should not see it. */ @@ -666,10 +667,10 @@ page_to_mfn(l1_pg), 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); + /* Fill with INVALID_M2P_ENTRY. */ memset((void *)(RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), - 0x55, - 1UL << L2_PAGETABLE_SHIFT); + 0xFF, 1UL << L2_PAGETABLE_SHIFT); /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */ l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT)); } @@ -1105,7 +1106,7 @@ * 0xf6800000. Extend these to allow access to the larger read-only * M2P table available in 32on64 mode. */ - base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16); + base = (b & 0xff000000) | ((b & 0xff) << 16) | (a >> 16); limit = (b & 0xf0000) | (a & 0xffff); limit++; /* We add one because limit is inclusive. */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/x86_emulate/x86_emulate.c xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/x86_emulate/x86_emulate.c --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/x86_emulate/x86_emulate.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/x86_emulate/x86_emulate.c 2019-12-11 14:35:39.000000000 +0000 @@ -3814,10 +3814,12 @@ { /* * xbegin unconditionally aborts, xabort is unconditionally - * a nop. + * a nop. It also does not truncate the destination address to + * 16 bits when 16-bit operand size is in effect. */ if ( b & 1 ) { + op_bytes = 4; jmp_rel((int32_t)src.val); _regs.r(ax) = 0; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/arch/x86/xen.lds.S xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/xen.lds.S --- xen-4.11.1+92-g6c33308a8d/xen/arch/x86/xen.lds.S 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/arch/x86/xen.lds.S 2019-12-11 14:35:39.000000000 +0000 @@ -277,14 +277,16 @@ __bss_start = .; *(.bss.stack_aligned) *(.bss.page_aligned*) - *(.bss) - . = ALIGN(SMP_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; + *(.bss.percpu.page_aligned) *(.bss.percpu) . = ALIGN(SMP_CACHE_BYTES); *(.bss.percpu.read_mostly) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_data_end = .; + *(.bss) + . = ALIGN(POINTER_ALIGN); __bss_end = .; } :text _end = . 
; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/Kconfig xen-4.11.3+24-g14b62ab3e5/xen/common/Kconfig --- xen-4.11.1+92-g6c33308a8d/xen/common/Kconfig 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/Kconfig 2019-12-11 14:35:39.000000000 +0000 @@ -93,7 +93,7 @@ config XSM bool "Xen Security Modules support" - default n + default ARM ---help--- Enables the security framework known as Xen Security Modules which allows administrators fine-grained control over a Xen domain and @@ -105,7 +105,7 @@ config FLASK def_bool y - prompt "FLux Advanced Security Kernel support" if EXPERT = "y" + prompt "FLux Advanced Security Kernel support" depends on XSM ---help--- Enables FLASK (FLux Advanced Security Kernel) as the access control @@ -130,7 +130,7 @@ config XSM_POLICY bool "Compile Xen with a built-in security policy" default y if HAS_CHECKPOLICY = "y" - depends on XSM + depends on FLASK ---help--- This includes a default XSM policy in the hypervisor so that the bootloader does not need to load a policy to get sane behavior from an @@ -143,6 +143,33 @@ If unsure, say Y. +config SILO + def_bool y + prompt "SILO support" + depends on XSM + ---help--- + Enables SILO as the access control mechanism used by the XSM framework. + This is not the default module, add boot parameter xsm=silo to choose + it. This will deny any unmediated communication channels (grant tables + and event channels) between unprivileged VMs. + + If unsure, say Y. + +choice + prompt "Default XSM implementation" + depends on XSM + default XSM_SILO_DEFAULT if SILO && ARM + default XSM_FLASK_DEFAULT if FLASK + default XSM_SILO_DEFAULT if SILO + default XSM_DUMMY_DEFAULT + config XSM_DUMMY_DEFAULT + bool "Match non-XSM behavior" + config XSM_FLASK_DEFAULT + bool "FLux Advanced Security Kernel" if FLASK + config XSM_SILO_DEFAULT + bool "SILO" if SILO +endchoice + config LATE_HWDOM bool "Dedicated hardware domain" default n diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/compat/domain.c xen-4.11.3+24-g14b62ab3e5/xen/common/compat/domain.c --- xen-4.11.1+92-g6c33308a8d/xen/common/compat/domain.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/compat/domain.c 2019-12-11 14:35:39.000000000 +0000 @@ -81,7 +81,7 @@ } if ( rc == -ERESTART ) - rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", cmd, vcpuid, arg); break; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/domain.c xen-4.11.3+24-g14b62ab3e5/xen/common/domain.c --- xen-4.11.1+92-g6c33308a8d/xen/common/domain.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/domain.c 2019-12-11 14:35:39.000000000 +0000 @@ -1186,7 +1186,6 @@ void *mapping; vcpu_info_t *new_info; struct page_info *page; - int i; if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) ) return -EINVAL; @@ -1239,10 +1238,13 @@ * Mark everything as being pending just to make sure nothing gets * lost. The domain will get a spurious event, but it can cope. 
*/ - vcpu_info(v, evtchn_upcall_pending) = 1; - for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ ) - set_bit(i, &vcpu_info(v, evtchn_pending_sel)); - arch_evtchn_inject(v); +#ifdef CONFIG_COMPAT + if ( !has_32bit_shinfo(d) ) + write_atomic(&new_info->native.evtchn_pending_sel, ~0); + else +#endif + write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0); + vcpu_mark_events_pending(v); return 0; } @@ -1307,7 +1309,7 @@ rc = arch_initialise_vcpu(v, arg); if ( rc == -ERESTART ) - rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", cmd, vcpuid, arg); break; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/domctl.c xen-4.11.3+24-g14b62ab3e5/xen/common/domctl.c --- xen-4.11.1+92-g6c33308a8d/xen/common/domctl.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/domctl.c 2019-12-11 14:35:39.000000000 +0000 @@ -392,6 +392,16 @@ switch ( op->cmd ) { + case XEN_DOMCTL_assign_device: + case XEN_DOMCTL_deassign_device: + if ( op->domain == DOMID_IO ) + { + d = dom_io; + break; + } + else if ( op->domain == DOMID_INVALID ) + return -ESRCH; + /* fall through */ case XEN_DOMCTL_test_assign_device: if ( op->domain == DOMID_INVALID ) { @@ -413,7 +423,7 @@ if ( !domctl_lock_acquire() ) { - if ( d ) + if ( d && d != dom_io ) rcu_unlock_domain(d); return hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); @@ -1148,7 +1158,7 @@ domctl_lock_release(); domctl_out_unlock_domonly: - if ( d ) + if ( d && d != dom_io ) rcu_unlock_domain(d); if ( copyback && __copy_to_guest(u_domctl, op, 1) ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/efi/boot.c xen-4.11.3+24-g14b62ab3e5/xen/common/efi/boot.c --- xen-4.11.1+92-g6c33308a8d/xen/common/efi/boot.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/efi/boot.c 2019-12-11 14:35:39.000000000 +0000 @@ -986,8 +986,12 @@ EFI_STATUS status; UINTN info_size; - /* Set graphics mode. */ - if ( gop_mode < gop->Mode->MaxMode && gop_mode != gop->Mode->Mode ) + /* + * Set graphics mode to a selected one and reset it if we didn't come + * directly from EFI loader as video settings might have been already modified. + */ + if ( gop_mode < gop->Mode->MaxMode && + (gop_mode != gop->Mode->Mode || !efi_enabled(EFI_LOADER)) ) gop->SetMode(gop, gop_mode); /* Get graphics and frame buffer info. */ @@ -1051,7 +1055,7 @@ return -EINVAL; for ( *pos = 0; !(mask & 1); ++*pos ) mask >>= 1; - for ( *sz = 0; mask & 1; ++sz) + for ( *sz = 0; mask & 1; ++*sz) mask >>= 1; if ( mask ) return -EINVAL; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/event_2l.c xen-4.11.3+24-g14b62ab3e5/xen/common/event_2l.c --- xen-4.11.1+92-g6c33308a8d/xen/common/event_2l.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/event_2l.c 2019-12-11 14:35:39.000000000 +0000 @@ -13,6 +13,8 @@ #include #include +#include + static void evtchn_2l_set_pending(struct vcpu *v, struct evtchn *evtchn) { struct domain *d = v->domain; @@ -25,12 +27,12 @@ * others may require explicit memory barriers. 
*/ - if ( test_and_set_bit(port, &shared_info(d, evtchn_pending)) ) + if ( guest_test_and_set_bit(d, port, &shared_info(d, evtchn_pending)) ) return; - if ( !test_bit (port, &shared_info(d, evtchn_mask)) && - !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d), - &vcpu_info(v, evtchn_pending_sel)) ) + if ( !guest_test_bit(d, port, &shared_info(d, evtchn_mask)) && + !guest_test_and_set_bit(d, port / BITS_PER_EVTCHN_WORD(d), + &vcpu_info(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); } @@ -40,7 +42,7 @@ static void evtchn_2l_clear_pending(struct domain *d, struct evtchn *evtchn) { - clear_bit(evtchn->port, &shared_info(d, evtchn_pending)); + guest_clear_bit(d, evtchn->port, &shared_info(d, evtchn_pending)); } static void evtchn_2l_unmask(struct domain *d, struct evtchn *evtchn) @@ -52,10 +54,10 @@ * These operations must happen in strict order. Based on * evtchn_2l_set_pending() above. */ - if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) && - test_bit (port, &shared_info(d, evtchn_pending)) && - !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d), - &vcpu_info(v, evtchn_pending_sel)) ) + if ( guest_test_and_clear_bit(d, port, &shared_info(d, evtchn_mask)) && + guest_test_bit(d, port, &shared_info(d, evtchn_pending)) && + !guest_test_and_set_bit(d, port / BITS_PER_EVTCHN_WORD(d), + &vcpu_info(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); } @@ -66,7 +68,8 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); - return port < max_ports && test_bit(port, &shared_info(d, evtchn_pending)); + return (port < max_ports && + guest_test_bit(d, port, &shared_info(d, evtchn_pending))); } static bool evtchn_2l_is_masked(const struct domain *d, evtchn_port_t port) @@ -74,7 +77,8 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); - return port >= max_ports || test_bit(port, &shared_info(d, evtchn_mask)); + return (port >= max_ports || + guest_test_bit(d, port, &shared_info(d, evtchn_mask))); } static void evtchn_2l_print_state(struct domain *d, diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/event_fifo.c xen-4.11.3+24-g14b62ab3e5/xen/common/event_fifo.c --- xen-4.11.1+92-g6c33308a8d/xen/common/event_fifo.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/event_fifo.c 2019-12-11 14:35:39.000000000 +0000 @@ -17,6 +17,8 @@ #include #include +#include + #include static inline event_word_t *evtchn_fifo_word_from_port(const struct domain *d, @@ -50,7 +52,7 @@ * on the wrong VCPU or with an unexpected priority. */ word = evtchn_fifo_word_from_port(d, evtchn->port); - if ( word && test_bit(EVTCHN_FIFO_LINKED, word) ) + if ( word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word) ) gdprintk(XENLOG_WARNING, "domain %d, port %d already on a queue\n", d->domain_id, evtchn->port); } @@ -115,7 +117,7 @@ * We block unmasking by the guest by marking the tail word as BUSY, * therefore, the cmpxchg() may fail at most 4 times. */ -static bool_t evtchn_fifo_set_link(const struct domain *d, event_word_t *word, +static bool_t evtchn_fifo_set_link(struct domain *d, event_word_t *word, uint32_t link) { event_word_t w; @@ -129,7 +131,7 @@ return ret; /* Lock the word to prevent guest unmasking. 
*/ - set_bit(EVTCHN_FIFO_BUSY, word); + guest_set_bit(d, EVTCHN_FIFO_BUSY, word); w = read_atomic(word); @@ -139,13 +141,13 @@ if ( ret >= 0 ) { if ( ret == 0 ) - clear_bit(EVTCHN_FIFO_BUSY, word); + guest_clear_bit(d, EVTCHN_FIFO_BUSY, word); return ret; } } gdprintk(XENLOG_WARNING, "domain %d, port %d not linked\n", d->domain_id, link); - clear_bit(EVTCHN_FIFO_BUSY, word); + guest_clear_bit(d, EVTCHN_FIFO_BUSY, word); return 1; } @@ -170,13 +172,13 @@ return; } - was_pending = test_and_set_bit(EVTCHN_FIFO_PENDING, word); + was_pending = guest_test_and_set_bit(d, EVTCHN_FIFO_PENDING, word); /* * Link the event if it unmasked and not already linked. */ - if ( !test_bit(EVTCHN_FIFO_MASKED, word) - && !test_bit(EVTCHN_FIFO_LINKED, word) ) + if ( !guest_test_bit(d, EVTCHN_FIFO_MASKED, word) && + !guest_test_bit(d, EVTCHN_FIFO_LINKED, word) ) { struct evtchn_fifo_queue *q, *old_q; event_word_t *tail_word; @@ -205,7 +207,7 @@ if ( !old_q ) goto done; - if ( test_and_set_bit(EVTCHN_FIFO_LINKED, word) ) + if ( guest_test_and_set_bit(d, EVTCHN_FIFO_LINKED, word) ) { spin_unlock_irqrestore(&old_q->lock, flags); goto done; @@ -251,8 +253,8 @@ spin_unlock_irqrestore(&q->lock, flags); if ( !linked - && !test_and_set_bit(q->priority, - &v->evtchn_fifo->control_block->ready) ) + && !guest_test_and_set_bit(d, q->priority, + &v->evtchn_fifo->control_block->ready) ) vcpu_mark_events_pending(v); } done: @@ -274,7 +276,7 @@ * No need to unlink as the guest will unlink and ignore * non-pending events. */ - clear_bit(EVTCHN_FIFO_PENDING, word); + guest_clear_bit(d, EVTCHN_FIFO_PENDING, word); } static void evtchn_fifo_unmask(struct domain *d, struct evtchn *evtchn) @@ -286,10 +288,10 @@ if ( unlikely(!word) ) return; - clear_bit(EVTCHN_FIFO_MASKED, word); + guest_clear_bit(d, EVTCHN_FIFO_MASKED, word); /* Relink if pending. */ - if ( test_bit(EVTCHN_FIFO_PENDING, word) ) + if ( guest_test_bit(d, EVTCHN_FIFO_PENDING, word) ) evtchn_fifo_set_pending(v, evtchn); } @@ -297,21 +299,21 @@ { const event_word_t *word = evtchn_fifo_word_from_port(d, port); - return word && test_bit(EVTCHN_FIFO_PENDING, word); + return word && guest_test_bit(d, EVTCHN_FIFO_PENDING, word); } static bool_t evtchn_fifo_is_masked(const struct domain *d, evtchn_port_t port) { const event_word_t *word = evtchn_fifo_word_from_port(d, port); - return !word || test_bit(EVTCHN_FIFO_MASKED, word); + return !word || guest_test_bit(d, EVTCHN_FIFO_MASKED, word); } static bool_t evtchn_fifo_is_busy(const struct domain *d, evtchn_port_t port) { const event_word_t *word = evtchn_fifo_word_from_port(d, port); - return word && test_bit(EVTCHN_FIFO_LINKED, word); + return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word); } static int evtchn_fifo_set_priority(struct domain *d, struct evtchn *evtchn, @@ -338,11 +340,11 @@ word = evtchn_fifo_word_from_port(d, evtchn->port); if ( !word ) printk("? "); - else if ( test_bit(EVTCHN_FIFO_LINKED, word) ) - printk("%c %-4u", test_bit(EVTCHN_FIFO_BUSY, word) ? 'B' : ' ', + else if ( guest_test_bit(d, EVTCHN_FIFO_LINKED, word) ) + printk("%c %-4u", guest_test_bit(d, EVTCHN_FIFO_BUSY, word) ? 'B' : ' ', *word & EVTCHN_FIFO_LINK_MASK); else - printk("%c - ", test_bit(EVTCHN_FIFO_BUSY, word) ? 'B' : ' '); + printk("%c - ", guest_test_bit(d, EVTCHN_FIFO_BUSY, word) ? 
'B' : ' '); } static const struct evtchn_port_ops evtchn_port_ops_fifo = @@ -494,7 +496,7 @@ evtchn = evtchn_from_port(d, port); - if ( test_bit(port, &shared_info(d, evtchn_pending)) ) + if ( guest_test_bit(d, port, &shared_info(d, evtchn_pending)) ) evtchn->pending = 1; evtchn_fifo_set_priority(d, evtchn, EVTCHN_FIFO_PRIORITY_DEFAULT); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/grant_table.c xen-4.11.3+24-g14b62ab3e5/xen/common/grant_table.c --- xen-4.11.1+92-g6c33308a8d/xen/common/grant_table.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/grant_table.c 2019-12-11 14:35:39.000000000 +0000 @@ -39,6 +39,7 @@ #include #include #include +#include /* Per-domain grant information. */ struct grant_table { @@ -652,11 +653,12 @@ return 0; } -static int _set_status_v1(domid_t domid, +static int _set_status_v1(const grant_entry_header_t *shah, + struct domain *rd, + struct active_grant_entry *act, int readonly, int mapflag, - grant_entry_header_t *shah, - struct active_grant_entry *act) + domid_t ldomid) { int rc = GNTST_okay; union grant_combo scombo, prev_scombo, new_scombo; @@ -691,11 +693,11 @@ if ( !act->pin && (((scombo.shorts.flags & mask) != GTF_permit_access) || - (scombo.shorts.domid != domid)) ) + (scombo.shorts.domid != ldomid)) ) PIN_FAIL(done, GNTST_general_error, "Bad flags (%x) or dom (%d); expected d%d\n", scombo.shorts.flags, scombo.shorts.domid, - domid); + ldomid); new_scombo = scombo; new_scombo.shorts.flags |= GTF_reading; @@ -708,8 +710,8 @@ "Attempt to write-pin a r/o grant entry\n"); } - prev_scombo.word = cmpxchg((u32 *)shah, - scombo.word, new_scombo.word); + prev_scombo.word = guest_cmpxchg(rd, (u32 *)shah, + scombo.word, new_scombo.word); if ( likely(prev_scombo.word == scombo.word) ) break; @@ -724,12 +726,13 @@ return rc; } -static int _set_status_v2(domid_t domid, +static int _set_status_v2(const grant_entry_header_t *shah, + grant_status_t *status, + struct domain *rd, + struct active_grant_entry *act, int readonly, int mapflag, - grant_entry_header_t *shah, - struct active_grant_entry *act, - grant_status_t *status) + domid_t ldomid) { int rc = GNTST_okay; union grant_combo scombo; @@ -755,10 +758,10 @@ if ( !act->pin && ( (((flags & mask) != GTF_permit_access) && ((flags & mask) != GTF_transitive)) || - (id != domid)) ) + (id != ldomid)) ) PIN_FAIL(done, GNTST_general_error, "Bad flags (%x) or dom (%d); expected d%d, flags %x\n", - flags, id, domid, mask); + flags, id, ldomid, mask); if ( readonly ) { @@ -785,21 +788,21 @@ { if ( (((flags & mask) != GTF_permit_access) && ((flags & mask) != GTF_transitive)) || - (id != domid) || + (id != ldomid) || (!readonly && (flags & GTF_readonly)) ) { - gnttab_clear_flag(_GTF_writing, status); - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_writing, status); + gnttab_clear_flag(rd, _GTF_reading, status); PIN_FAIL(done, GNTST_general_error, "Unstable flags (%x) or dom (%d); expected d%d (r/w: %d)\n", - flags, id, domid, !readonly); + flags, id, ldomid, !readonly); } } else { if ( unlikely(flags & GTF_readonly) ) { - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); PIN_FAIL(done, GNTST_general_error, "Unstable grant readonly flag\n"); } @@ -810,19 +813,20 @@ } -static int _set_status(unsigned gt_version, - domid_t domid, +static int _set_status(const grant_entry_header_t *shah, + grant_status_t *status, + struct domain *rd, + unsigned rgt_version, + struct active_grant_entry *act, int readonly, int mapflag, - grant_entry_header_t 
*shah, - struct active_grant_entry *act, - grant_status_t *status) + domid_t ldomid) { - if ( gt_version == 1 ) - return _set_status_v1(domid, readonly, mapflag, shah, act); + if ( rgt_version == 1 ) + return _set_status_v1(shah, rd, act, readonly, mapflag, ldomid); else - return _set_status_v2(domid, readonly, mapflag, shah, act, status); + return _set_status_v2(shah, status, rd, act, readonly, mapflag, ldomid); } static struct active_grant_entry *grant_map_exists(const struct domain *ld, @@ -915,8 +919,6 @@ mfn_t frame; struct page_info *pg = NULL; int rc = GNTST_okay; - u32 old_pin; - u32 act_pin; unsigned int cache_flags, refcnt = 0, typecnt = 0; bool host_map_created = false; struct active_grant_entry *act = NULL; @@ -994,9 +996,9 @@ (!(op->flags & GNTMAP_readonly) && !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask))) ) { - if ( (rc = _set_status(rgt->gt_version, ld->domain_id, - op->flags & GNTMAP_readonly, - 1, shah, act, status) ) != GNTST_okay ) + if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act, + op->flags & GNTMAP_readonly, 1, + ld->domain_id) != GNTST_okay) ) goto act_release_out; if ( !act->pin ) @@ -1020,7 +1022,6 @@ } } - old_pin = act->pin; if ( op->flags & GNTMAP_device_map ) act->pin += (op->flags & GNTMAP_readonly) ? GNTPIN_devr_inc : GNTPIN_devw_inc; @@ -1029,7 +1030,6 @@ GNTPIN_hstr_inc : GNTPIN_hstw_inc; frame = act->frame; - act_pin = act->pin; cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) ); @@ -1137,27 +1137,22 @@ if ( need_iommu ) { unsigned int kind; - int err = 0; double_gt_lock(lgt, rgt); - /* We're not translated, so we know that gmfns and mfns are - the same things, so the IOMMU entry is always 1-to-1. */ + /* + * We're not translated, so we know that dfns and mfns are + * the same things, so the IOMMU entry is always 1-to-1. + */ kind = mapkind(lgt, rd, frame); - if ( (act_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) && - !(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) ) - { - if ( !(kind & MAPKIND_WRITE) ) - err = iommu_map_page(ld, mfn_x(frame), mfn_x(frame), - IOMMUF_readable|IOMMUF_writable); - } - else if ( act_pin && !old_pin ) - { - if ( !kind ) - err = iommu_map_page(ld, mfn_x(frame), mfn_x(frame), - IOMMUF_readable); - } - if ( err ) + if ( !(op->flags & GNTMAP_readonly) && + !(kind & MAPKIND_WRITE) ) + kind = IOMMUF_readable | IOMMUF_writable; + else if ( !kind ) + kind = IOMMUF_readable; + else + kind = 0; + if ( kind && iommu_map_page(ld, mfn_x(frame), mfn_x(frame), kind) ) { double_gt_unlock(lgt, rgt); rc = GNTST_general_error; @@ -1172,7 +1167,7 @@ * other fields so just ensure the flags field is stored last. * * However, if gnttab_need_iommu_mapping() then this would race - * with a concurrent mapcount() call (on an unmap, for example) + * with a concurrent mapkind() call (on an unmap, for example) * and a lock is required. 
*/ mt = &maptrack_entry(lgt, handle); @@ -1218,10 +1213,10 @@ unlock_out_clear: if ( !(op->flags & GNTMAP_readonly) && !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); if ( !act->pin ) - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_reading, status); act_release_out: active_entry_release(act); @@ -1505,10 +1500,10 @@ if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) && !(op->done & GNTMAP_readonly) ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); if ( act->pin == 0 ) - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_reading, status); active_entry_release(act); grant_read_unlock(rgt); @@ -2073,8 +2068,8 @@ new_scombo = scombo; new_scombo.shorts.flags |= GTF_transfer_committed; - prev_scombo.word = cmpxchg((u32 *)&sha->flags, - scombo.word, new_scombo.word); + prev_scombo.word = guest_cmpxchg(rd, (u32 *)&sha->flags, + scombo.word, new_scombo.word); if ( likely(prev_scombo.word == scombo.word) ) break; @@ -2359,11 +2354,11 @@ act->pin -= GNTPIN_hstw_inc; if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); } if ( !act->pin ) - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_reading, status); active_entry_release(act); grant_read_unlock(rgt); @@ -2385,14 +2380,15 @@ under the domain's grant table lock. */ /* Only safe on transitive grants. Even then, note that we don't attempt to drop any pin on the referent grant. */ -static void fixup_status_for_copy_pin(const struct active_grant_entry *act, +static void fixup_status_for_copy_pin(struct domain *rd, + const struct active_grant_entry *act, uint16_t *status) { if ( !(act->pin & (GNTPIN_hstw_mask | GNTPIN_devw_mask)) ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); if ( !act->pin ) - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_reading, status); } /* Grab a frame number from a grant entry and update the flags and pin @@ -2452,8 +2448,8 @@ { if ( (!old_pin || (!readonly && !(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)))) && - (rc = _set_status_v2(ldom, readonly, 0, shah, act, - status)) != GNTST_okay ) + (rc = _set_status_v2(shah, status, rd, act, readonly, 0, + ldom)) != GNTST_okay ) goto unlock_out; if ( !allow_transitive ) @@ -2501,7 +2497,7 @@ if ( rc != GNTST_okay ) { - fixup_status_for_copy_pin(act, status); + fixup_status_for_copy_pin(rd, act, status); rcu_unlock_domain(td); active_entry_release(act); grant_read_unlock(rgt); @@ -2524,7 +2520,7 @@ !act->is_sub_page)) ) { release_grant_for_copy(td, trans_gref, readonly); - fixup_status_for_copy_pin(act, status); + fixup_status_for_copy_pin(rd, act, status); rcu_unlock_domain(td); active_entry_release(act); grant_read_unlock(rgt); @@ -2553,9 +2549,8 @@ else if ( !old_pin || (!readonly && !(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask))) ) { - if ( (rc = _set_status(rgt->gt_version, ldom, - readonly, 0, shah, act, - status) ) != GNTST_okay ) + if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act, + readonly, 0, ldom)) != GNTST_okay ) goto unlock_out; td = rd; @@ -2642,10 +2637,10 @@ unlock_out_clear: if ( !(readonly) && !(act->pin & (GNTPIN_hstw_mask | GNTPIN_devw_mask)) ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); if ( !act->pin ) - gnttab_clear_flag(_GTF_reading, 
status); + gnttab_clear_flag(rd, _GTF_reading, status); unlock_out: active_entry_release(act); @@ -3713,11 +3708,11 @@ } if ( (act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0 ) - gnttab_clear_flag(_GTF_writing, status); + gnttab_clear_flag(rd, _GTF_writing, status); } if ( act->pin == 0 ) - gnttab_clear_flag(_GTF_reading, status); + gnttab_clear_flag(rd, _GTF_reading, status); active_entry_release(act); grant_read_unlock(rgt); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/livepatch.c xen-4.11.3+24-g14b62ab3e5/xen/common/livepatch.c --- xen-4.11.1+92-g6c33308a8d/xen/common/livepatch.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/livepatch.c 2019-12-11 14:35:39.000000000 +0000 @@ -1060,6 +1060,14 @@ unsigned int i; int rc; + rc = arch_livepatch_safety_check(); + if ( rc ) + { + printk(XENLOG_ERR LIVEPATCH "%s: Safety checks failed: %d\n", + data->name, rc); + return rc; + } + printk(XENLOG_INFO LIVEPATCH "%s: Applying %u functions\n", data->name, data->nfuncs); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/lz4/decompress.c xen-4.11.3+24-g14b62ab3e5/xen/common/lz4/decompress.c --- xen-4.11.1+92-g6c33308a8d/xen/common/lz4/decompress.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/lz4/decompress.c 2019-12-11 14:35:39.000000000 +0000 @@ -132,8 +132,12 @@ /* Error: request to write beyond destination buffer */ if (cpy > oend) goto _output_error; +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else if ((ref + COPYLENGTH) > oend || (op + COPYLENGTH) > oend) +#endif goto _output_error; LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) @@ -147,7 +151,7 @@ goto _output_error; continue; } - if (unlikely((unsigned long)cpy < (unsigned long)op)) + if (unlikely((unsigned long)cpy < (unsigned long)op - (STEPSIZE - 4))) goto _output_error; LZ4_SECURECOPY(ref, op, cpy); op = cpy; /* correction */ @@ -266,7 +270,13 @@ if (cpy > oend - COPYLENGTH) { if (cpy > oend) goto _output_error; /* write outside of buf */ - +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) *op++ = *ref++; @@ -279,7 +289,7 @@ goto _output_error; continue; } - if (unlikely((unsigned long)cpy < (unsigned long)op)) + if (unlikely((unsigned long)cpy < (unsigned long)op - (STEPSIZE - 4))) goto _output_error; LZ4_SECURECOPY(ref, op, cpy); op = cpy; /* correction */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/memory.c xen-4.11.3+24-g14b62ab3e5/xen/common/memory.c --- xen-4.11.1+92-g6c33308a8d/xen/common/memory.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/memory.c 2019-12-11 14:35:39.000000000 +0000 @@ -1163,7 +1163,7 @@ * hypercall has failed and only part of the extents where * processed. */ - pv_shim_offline_memory(args.nr_extents, args.nr_done); + pv_shim_offline_memory(args.nr_done, args.extent_order); #endif break; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/page_alloc.c xen-4.11.3+24-g14b62ab3e5/xen/common/page_alloc.c --- xen-4.11.1+92-g6c33308a8d/xen/common/page_alloc.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/page_alloc.c 2019-12-11 14:35:39.000000000 +0000 @@ -1728,6 +1728,18 @@ unsigned long i; /* + * Keep MFN 0 away from the buddy allocator to avoid crossing zone + * boundary when merging two buddies. 
+ */ + if ( !mfn_x(page_to_mfn(pg)) ) + { + if ( nr_pages-- <= 1 ) + return; + pg++; + } + + + /* * Some pages may not go through the boot allocator (e.g reserved * memory at boot but released just after --- kernel, initramfs, * etc.). diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/sched_credit2.c xen-4.11.3+24-g14b62ab3e5/xen/common/sched_credit2.c --- xen-4.11.1+92-g6c33308a8d/xen/common/sched_credit2.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/sched_credit2.c 2019-12-11 14:35:39.000000000 +0000 @@ -4075,6 +4075,8 @@ prv = csched2_priv(ops); ops->sched_data = NULL; + if ( prv ) + xfree(prv->rqd); xfree(prv); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/common/schedule.c xen-4.11.3+24-g14b62ab3e5/xen/common/schedule.c --- xen-4.11.1+92-g6c33308a8d/xen/common/schedule.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/common/schedule.c 2019-12-11 14:35:39.000000000 +0000 @@ -337,7 +337,7 @@ if ( vcpu_priv[v->vcpu_id] == NULL ) { for_each_vcpu ( d, v ) - xfree(vcpu_priv[v->vcpu_id]); + SCHED_OP(c->sched, free_vdata, vcpu_priv[v->vcpu_id]); xfree(vcpu_priv); sched_free_domdata(c->sched, domdata); return -ENOMEM; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_init.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_init.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_init.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_init.c 2019-12-11 14:35:39.000000000 +0000 @@ -1071,13 +1071,12 @@ { list_del(&iommu->list); if ( iommu->enabled ) - { disable_iommu(iommu); - deallocate_ring_buffer(&iommu->cmd_buffer); - deallocate_ring_buffer(&iommu->event_log); - deallocate_ring_buffer(&iommu->ppr_log); - unmap_iommu_mmio_region(iommu); - } + + deallocate_ring_buffer(&iommu->cmd_buffer); + deallocate_ring_buffer(&iommu->event_log); + deallocate_ring_buffer(&iommu->ppr_log); + unmap_iommu_mmio_region(iommu); xfree(iommu); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_intr.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_intr.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_intr.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_intr.c 2019-12-11 14:35:39.000000000 +0000 @@ -22,6 +22,7 @@ #include #include #include +#include #define INTREMAP_TABLE_ORDER 1 #define INTREMAP_LENGTH 0xB @@ -610,6 +611,8 @@ { void *tb = ivrs_mapping->intremap_table; + XFREE(ivrs_mapping->intremap_inuse); + if ( tb ) { __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER); @@ -696,6 +699,8 @@ dump_intremap_table(ivrs_mapping->intremap_table); spin_unlock_irqrestore(&(ivrs_mapping->intremap_lock), flags); + process_pending_softirqs(); + return 0; } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_map.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_map.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/iommu_map.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/iommu_map.c 2019-12-11 14:35:39.000000000 +0000 @@ -456,7 +456,7 @@ * page tables. 
*/ static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn, - unsigned long pt_mfn[]) + unsigned long pt_mfn[], bool map) { u64 *pde, *next_table_vaddr; unsigned long next_table_mfn; @@ -470,6 +470,13 @@ BUG_ON( table == NULL || level < IOMMU_PAGING_MODE_LEVEL_1 || level > IOMMU_PAGING_MODE_LEVEL_6 ); + /* + * A frame number past what the current page tables can represent can't + * possibly have a mapping. + */ + if ( pfn >> (PTE_PER_TABLE_SHIFT * level) ) + return 0; + next_table_mfn = mfn_x(page_to_mfn(table)); if ( level == IOMMU_PAGING_MODE_LEVEL_1 ) @@ -530,6 +537,9 @@ /* Install lower level page table for non-present entries */ else if ( !iommu_is_pte_present((u32*)pde) ) { + if ( !map ) + return 0; + if ( next_table_mfn == 0 ) { table = alloc_amd_iommu_pgtable(); @@ -559,97 +569,6 @@ return 0; } -static int update_paging_mode(struct domain *d, unsigned long gfn) -{ - u16 bdf; - void *device_entry; - unsigned int req_id, level, offset; - unsigned long flags; - struct pci_dev *pdev; - struct amd_iommu *iommu = NULL; - struct page_info *new_root = NULL; - struct page_info *old_root = NULL; - void *new_root_vaddr; - unsigned long old_root_mfn; - struct domain_iommu *hd = dom_iommu(d); - - if ( gfn == gfn_x(INVALID_GFN) ) - return -EADDRNOTAVAIL; - ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH)); - - level = hd->arch.paging_mode; - old_root = hd->arch.root_table; - offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1)); - - ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d)); - - while ( offset >= PTE_PER_TABLE_SIZE ) - { - /* Allocate and install a new root table. - * Only upper I/O page table grows, no need to fix next level bits */ - new_root = alloc_amd_iommu_pgtable(); - if ( new_root == NULL ) - { - AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n", - __func__); - return -ENOMEM; - } - - new_root_vaddr = __map_domain_page(new_root); - old_root_mfn = mfn_x(page_to_mfn(old_root)); - set_iommu_pde_present(new_root_vaddr, old_root_mfn, level, - !!IOMMUF_writable, !!IOMMUF_readable); - level++; - old_root = new_root; - offset >>= PTE_PER_TABLE_SHIFT; - unmap_domain_page(new_root_vaddr); - } - - if ( new_root != NULL ) - { - hd->arch.paging_mode = level; - hd->arch.root_table = new_root; - - if ( !pcidevs_locked() ) - AMD_IOMMU_DEBUG("%s Try to access pdev_list " - "without aquiring pcidevs_lock.\n", __func__); - - /* Update device table entries using new root table and paging mode */ - for_each_pdev( d, pdev ) - { - bdf = PCI_BDF2(pdev->bus, pdev->devfn); - iommu = find_iommu_for_device(pdev->seg, bdf); - if ( !iommu ) - { - AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__); - return -ENODEV; - } - - spin_lock_irqsave(&iommu->lock, flags); - do { - req_id = get_dma_requestor_id(pdev->seg, bdf); - device_entry = iommu->dev_table.buffer + - (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); - - /* valid = 0 only works for dom0 passthrough mode */ - amd_iommu_set_root_page_table((u32 *)device_entry, - page_to_maddr(hd->arch.root_table), - d->domain_id, - hd->arch.paging_mode, 1); - - amd_iommu_flush_device(iommu, req_id); - bdf += pdev->phantom_stride; - } while ( PCI_DEVFN2(bdf) != pdev->devfn && - PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); - spin_unlock_irqrestore(&iommu->lock, flags); - } - - /* For safety, invalidate all entries */ - amd_iommu_flush_all_pages(d); - } - return 0; -} - int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags) { @@ -675,20 +594,7 @@ return rc; } - /* Since HVM domain is initialized with 2 level IO 
page table, - * we might need a deeper page table for lager gfn now */ - if ( is_hvm_domain(d) ) - { - if ( update_paging_mode(d, gfn) ) - { - spin_unlock(&hd->arch.mapping_lock); - AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn); - domain_crash(d); - return -EFAULT; - } - } - - if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) ) + if ( iommu_pde_from_gfn(d, gfn, pt_mfn, true) || (pt_mfn[1] == 0) ) { spin_unlock(&hd->arch.mapping_lock); AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); @@ -767,23 +673,7 @@ return 0; } - /* Since HVM domain is initialized with 2 level IO page table, - * we might need a deeper page table for lager gfn now */ - if ( is_hvm_domain(d) ) - { - int rc = update_paging_mode(d, gfn); - - if ( rc ) - { - spin_unlock(&hd->arch.mapping_lock); - AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn); - if ( rc != -EADDRNOTAVAIL ) - domain_crash(d); - return rc; - } - } - - if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) ) + if ( iommu_pde_from_gfn(d, gfn, pt_mfn, false) ) { spin_unlock(&hd->arch.mapping_lock); AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); @@ -791,8 +681,11 @@ return -EFAULT; } - /* mark PTE as 'page not present' */ - clear_iommu_pte_present(pt_mfn[1], gfn); + if ( pt_mfn[1] ) + { + /* Mark PTE as 'page not present'. */ + clear_iommu_pte_present(pt_mfn[1], gfn); + } /* No further merging in amd_iommu_map_page(), as the logic doesn't cope. */ hd->arch.no_merge = true; @@ -848,3 +741,65 @@ mfn_x(pgd_mfn)); } } + +int __init amd_iommu_quarantine_init(struct domain *d) +{ + struct domain_iommu *hd = dom_iommu(d); + unsigned long max_gfn = + PFN_DOWN((1ul << DEFAULT_DOMAIN_ADDRESS_WIDTH) - 1); + unsigned int level = amd_iommu_get_paging_mode(max_gfn); + uint64_t *table; + + if ( hd->arch.root_table ) + { + ASSERT_UNREACHABLE(); + return 0; + } + + spin_lock(&hd->arch.mapping_lock); + + hd->arch.root_table = alloc_amd_iommu_pgtable(); + if ( !hd->arch.root_table ) + goto out; + + table = __map_domain_page(hd->arch.root_table); + while ( level ) + { + struct page_info *pg; + unsigned int i; + + /* + * The pgtable allocator is fine for the leaf page, as well as + * page table pages, and the resulting allocations are always + * zeroed. + */ + pg = alloc_amd_iommu_pgtable(); + if ( !pg ) + break; + + for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) + { + uint32_t *pde = (uint32_t *)&table[i]; + + /* + * PDEs are essentially a subset of PTEs, so this function + * is fine to use even at the leaf. + */ + set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1, + false, true); + } + + unmap_domain_page(table); + table = __map_domain_page(pg); + level--; + } + unmap_domain_page(table); + + out: + spin_unlock(&hd->arch.mapping_lock); + + amd_iommu_flush_all_pages(d); + + /* Pages leaked in failure case */ + return level ? 
-ENOMEM : 0; +} diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/pci_amd_iommu.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/pci_amd_iommu.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/amd/pci_amd_iommu.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/amd/pci_amd_iommu.c 2019-12-11 14:35:39.000000000 +0000 @@ -218,7 +218,7 @@ return rc; } -static int get_paging_mode(unsigned long entries) +int amd_iommu_get_paging_mode(unsigned long entries) { int level = 1; @@ -238,11 +238,18 @@ { struct domain_iommu *hd = dom_iommu(d); - /* For pv and dom0, stick with get_paging_mode(max_page) - * For HVM dom0, use 2 level page table at first */ - hd->arch.paging_mode = is_hvm_domain(d) ? - IOMMU_PAGING_MODE_LEVEL_2 : - get_paging_mode(max_page); + /* + * Choose the number of levels for the IOMMU page tables. + * - PV needs 3 or 4, depending on whether there is RAM (including hotplug + * RAM) above the 512G boundary. + * - HVM could in principle use 3 or 4 depending on how much guest + * physical address space we give it, but this isn't known yet so use 4 + * unilaterally. + */ + hd->arch.paging_mode = is_hvm_domain(d) + ? IOMMU_PAGING_MODE_LEVEL_4 + : amd_iommu_get_paging_mode(get_upper_mfn_bound()); + return 0; } @@ -391,7 +398,7 @@ ivrs_mappings[req_id].read_permission); } - return reassign_device(hardware_domain, d, devfn, pdev); + return reassign_device(pdev->domain, d, devfn, pdev); } static void deallocate_next_page_table(struct page_info *pg, int level) @@ -601,6 +608,7 @@ const struct iommu_ops amd_iommu_ops = { .init = amd_iommu_domain_init, .hwdom_init = amd_iommu_hwdom_init, + .quarantine_init = amd_iommu_quarantine_init, .add_device = amd_iommu_add_device, .remove_device = amd_iommu_remove_device, .assign_device = amd_iommu_assign_device, diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/device_tree.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/device_tree.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/device_tree.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/device_tree.c 2019-12-11 14:35:39.000000000 +0000 @@ -175,6 +175,9 @@ break; } + if ( d == dom_io ) + return -EINVAL; + ret = iommu_assign_dt_device(d, dev); if ( ret ) @@ -200,6 +203,9 @@ ret = xsm_deassign_dtdevice(XSM_HOOK, d, dt_node_full_name(dev)); + if ( d == dom_io ) + return -EINVAL; + ret = iommu_deassign_dt_device(d, dev); if ( ret ) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/iommu.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/iommu.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/iommu.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/iommu.c 2019-12-11 14:35:39.000000000 +0000 @@ -52,6 +52,7 @@ bool_t __initdata iommu_enable = 1; bool_t __read_mostly iommu_enabled; bool_t __read_mostly force_iommu; +bool __read_mostly iommu_quarantine = true; bool_t __hwdom_initdata iommu_dom0_strict; bool_t __read_mostly iommu_verbose; bool_t __read_mostly iommu_workaround_bios_bug; @@ -99,6 +100,8 @@ else if ( !cmdline_strcmp(s, "force") || !cmdline_strcmp(s, "required") ) force_iommu = val; + else if ( !cmdline_strcmp(s, "quarantine") ) + iommu_quarantine = val; else if ( !cmdline_strcmp(s, "workaround_bios_bug") ) iommu_workaround_bios_bug = val; else if ( !cmdline_strcmp(s, "igfx") ) @@ -219,6 +222,9 @@ { const struct domain_iommu *hd = dom_iommu(d); + if ( d == dom_io ) + return; + d->need_iommu = 0; 
hd->platform_ops->teardown(d); tasklet_schedule(&iommu_pt_cleanup_tasklet); @@ -229,6 +235,9 @@ if ( need_iommu(d) > 0 ) return 0; + if ( d == dom_io ) + return 0; + if ( !iommu_use_hap_pt(d) ) { int rc; @@ -371,6 +380,21 @@ return rc; } +static int __init iommu_quarantine_init(void) +{ + const struct domain_iommu *hd = dom_iommu(dom_io); + int rc; + + rc = iommu_domain_init(dom_io); + if ( rc ) + return rc; + + if ( !hd->platform_ops->quarantine_init ) + return 0; + + return hd->platform_ops->quarantine_init(dom_io); +} + int __init iommu_setup(void) { int rc = -ENODEV; @@ -404,6 +428,9 @@ printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); if ( iommu_enabled ) { + if ( iommu_quarantine_init() ) + panic("Could not set up quarantine\n"); + printk(" - Dom0 mode: %s\n", iommu_passthrough ? "Passthrough" : iommu_dom0_strict ? "Strict" : "Relaxed"); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/pci.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/pci.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/pci.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/pci.c 2019-12-11 14:35:39.000000000 +0000 @@ -1389,19 +1389,29 @@ return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev)); } -/* - * If the device isn't owned by the hardware domain, it means it already - * has been assigned to other domain, or it doesn't exist. - */ static int device_assigned(u16 seg, u8 bus, u8 devfn) { struct pci_dev *pdev; + int rc = 0; pcidevs_lock(); - pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); + + pdev = pci_get_pdev(seg, bus, devfn); + + if ( !pdev ) + rc = -ENODEV; + /* + * If the device exists and it is not owned by either the hardware + * domain or dom_io then it must be assigned to a guest, or be + * hidden (owned by dom_xen). + */ + else if ( pdev->domain != hardware_domain && + pdev->domain != dom_io ) + rc = -EBUSY; + pcidevs_unlock(); - return pdev ? 0 : -EBUSY; + return rc; } static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) @@ -1415,8 +1425,9 @@ /* Prevent device assign if mem paging or mem sharing have been * enabled for this domain */ - if ( unlikely(!need_iommu(d) && - (d->arch.hvm_domain.mem_sharing_enabled || + if ( unlikely(!need_iommu(d) && d != dom_io && + ((is_hvm_domain(d) && + d->arch.hvm_domain.mem_sharing_enabled) || vm_event_check_ring(d->vm_event_paging) || p2m_get_hostp2m(d)->global_logdirty)) ) return -EXDEV; @@ -1431,15 +1442,28 @@ return rc; } - pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); + pdev = pci_get_pdev(seg, bus, devfn); + + rc = -ENODEV; if ( !pdev ) - { - rc = pci_get_pdev(seg, bus, devfn) ? 
-EBUSY : -ENODEV; goto done; - } + + rc = 0; + if ( d == pdev->domain ) + goto done; + + rc = -EBUSY; + if ( pdev->domain != hardware_domain && + pdev->domain != dom_io ) + goto done; if ( pdev->msix ) + { + rc = pci_reset_msix_state(pdev); + if ( rc ) + goto done; msixtbl_init(d); + } pdev->fault.count = 0; @@ -1459,6 +1483,10 @@ } done: + /* The device is assigned to dom_io so mark it as quarantined */ + if ( !rc && d == dom_io ) + pdev->quarantine = true; + if ( !has_arch_pdevs(d) && need_iommu(d) ) iommu_teardown(d); pcidevs_unlock(); @@ -1471,6 +1499,7 @@ { const struct domain_iommu *hd = dom_iommu(d); struct pci_dev *pdev = NULL; + struct domain *target; int ret = 0; if ( !iommu_enabled || !hd->platform_ops ) @@ -1481,12 +1510,17 @@ if ( !pdev ) return -ENODEV; + /* De-assignment from dom_io should de-quarantine the device */ + target = ((pdev->quarantine || iommu_quarantine) && + pdev->domain != dom_io) ? + dom_io : hardware_domain; + while ( pdev->phantom_stride ) { devfn += pdev->phantom_stride; if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) break; - ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, + ret = hd->platform_ops->reassign_device(d, target, devfn, pci_to_dev(pdev)); if ( !ret ) continue; @@ -1497,7 +1531,7 @@ } devfn = pdev->devfn; - ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, + ret = hd->platform_ops->reassign_device(d, target, devfn, pci_to_dev(pdev)); if ( ret ) { @@ -1507,6 +1541,9 @@ return ret; } + if ( pdev->domain == hardware_domain ) + pdev->quarantine = false; + pdev->fault.count = 0; if ( !has_arch_pdevs(d) && need_iommu(d) ) @@ -1685,7 +1722,7 @@ ret = hypercall_create_continuation(__HYPERVISOR_domctl, "h", u_domctl); else if ( ret ) - printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " + printk(XENLOG_G_ERR "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d->domain_id, ret); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/extern.h xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/extern.h --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/extern.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/extern.h 2019-12-11 14:35:39.000000000 +0000 @@ -96,6 +96,8 @@ int __must_check me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map); void pci_vtd_quirk(const struct pci_dev *); +void quirk_iommu_caps(struct iommu *iommu); + bool_t platform_supports_intremap(void); bool_t platform_supports_x2apic(void); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/iommu.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/iommu.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/iommu.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/iommu.c 2019-12-11 14:35:39.000000000 +0000 @@ -192,7 +192,7 @@ nodeid_t node = NUMA_NO_NODE; unsigned int i; - rhsa = drhd_to_rhsa(drhd); + rhsa = drhd ? 
drhd_to_rhsa(drhd) : NULL; if ( rhsa ) node = pxm_to_node(rhsa->proximity_domain); @@ -1211,6 +1211,8 @@ if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) return -ENODEV; + quirk_iommu_caps(iommu); + if ( cap_fault_reg_offset(iommu->cap) + cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) @@ -2389,6 +2391,15 @@ if ( ret ) return ret; + if ( devfn == pdev->devfn && pdev->domain != dom_io ) + { + list_move(&pdev->domain_list, &dom_io->arch.pdev_list); + pdev->domain = dom_io; + } + + if ( !has_arch_pdevs(source) ) + vmx_pi_hooks_deassign(source); + if ( !has_arch_pdevs(target) ) vmx_pi_hooks_assign(target); @@ -2401,21 +2412,19 @@ return ret; } - if ( devfn == pdev->devfn ) + if ( devfn == pdev->devfn && pdev->domain != target ) { list_move(&pdev->domain_list, &target->arch.pdev_list); pdev->domain = target; } - if ( !has_arch_pdevs(source) ) - vmx_pi_hooks_deassign(source); - return ret; } static int intel_iommu_assign_device( struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) { + struct domain *s = pdev->domain; struct acpi_rmrr_unit *rmrr; int ret = 0, i; u16 bdf, seg; @@ -2458,8 +2467,8 @@ } } - ret = reassign_device_ownership(hardware_domain, d, devfn, pdev); - if ( ret ) + ret = reassign_device_ownership(s, d, devfn, pdev); + if ( ret || d == dom_io ) return ret; /* Setup rmrr identity mapping */ @@ -2472,11 +2481,20 @@ ret = rmrr_identity_mapping(d, 1, rmrr, flag); if ( ret ) { - reassign_device_ownership(d, hardware_domain, devfn, pdev); + int rc; + + rc = reassign_device_ownership(d, s, devfn, pdev); printk(XENLOG_G_ERR VTDPREFIX " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", rmrr->base_address, rmrr->end_address, d->domain_id, ret); + if ( rc ) + { + printk(XENLOG_ERR VTDPREFIX + " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); + domain_crash(d); + } break; } } @@ -2654,9 +2672,69 @@ vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0); } +static int __init intel_iommu_quarantine_init(struct domain *d) +{ + struct domain_iommu *hd = dom_iommu(d); + struct dma_pte *parent; + unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + unsigned int level = agaw_to_level(agaw); + int rc; + + if ( hd->arch.pgd_maddr ) + { + ASSERT_UNREACHABLE(); + return 0; + } + + spin_lock(&hd->arch.mapping_lock); + + hd->arch.pgd_maddr = alloc_pgtable_maddr(NULL, 1); + if ( !hd->arch.pgd_maddr ) + goto out; + + parent = map_vtd_domain_page(hd->arch.pgd_maddr); + while ( level ) + { + uint64_t maddr; + unsigned int offset; + + /* + * The pgtable allocator is fine for the leaf page, as well as + * page table pages, and the resulting allocations are always + * zeroed. + */ + maddr = alloc_pgtable_maddr(NULL, 1); + if ( !maddr ) + break; + + for ( offset = 0; offset < PTE_NUM; offset++ ) + { + struct dma_pte *pte = &parent[offset]; + + dma_set_pte_addr(*pte, maddr); + dma_set_pte_readable(*pte); + } + iommu_flush_cache_page(parent, 1); + + unmap_vtd_domain_page(parent); + parent = map_vtd_domain_page(maddr); + level--; + } + unmap_vtd_domain_page(parent); + + out: + spin_unlock(&hd->arch.mapping_lock); + + rc = iommu_flush_iotlb_all(d); + + /* Pages leaked in failure case */ + return level ? 
-ENOMEM : rc; +} + const struct iommu_ops intel_iommu_ops = { .init = intel_iommu_domain_init, .hwdom_init = intel_iommu_hwdom_init, + .quarantine_init = intel_iommu_quarantine_init, .add_device = intel_iommu_add_device, .enable_device = intel_iommu_enable_device, .remove_device = intel_iommu_remove_device, diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/iommu.h xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/iommu.h --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/iommu.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/iommu.h 2019-12-11 14:35:39.000000000 +0000 @@ -441,8 +441,7 @@ sdata : 32; }lo; struct { - u64 res_1 : 2, - saddr : 62; + u64 saddr; }hi; }inv_wait_dsc; }q; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/qinval.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/qinval.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/qinval.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/qinval.c 2019-12-11 14:35:39.000000000 +0000 @@ -147,13 +147,15 @@ u8 iflag, u8 sw, u8 fn, bool_t flush_dev_iotlb) { - volatile u32 poll_slot = QINVAL_STAT_INIT; + static DEFINE_PER_CPU(uint32_t, poll_slot); unsigned int index; unsigned long flags; u64 entry_base; struct qinval_entry *qinval_entry, *qinval_entries; + uint32_t *this_poll_slot = &this_cpu(poll_slot); spin_lock_irqsave(&iommu->register_lock, flags); + ACCESS_ONCE(*this_poll_slot) = QINVAL_STAT_INIT; index = qinval_next_index(iommu); entry_base = iommu_qi_ctrl(iommu)->qinval_maddr + ((index >> QINVAL_ENTRY_ORDER) << PAGE_SHIFT); @@ -166,8 +168,7 @@ qinval_entry->q.inv_wait_dsc.lo.fn = fn; qinval_entry->q.inv_wait_dsc.lo.res_1 = 0; qinval_entry->q.inv_wait_dsc.lo.sdata = QINVAL_STAT_DONE; - qinval_entry->q.inv_wait_dsc.hi.res_1 = 0; - qinval_entry->q.inv_wait_dsc.hi.saddr = virt_to_maddr(&poll_slot) >> 2; + qinval_entry->q.inv_wait_dsc.hi.saddr = virt_to_maddr(this_poll_slot); unmap_vtd_domain_page(qinval_entries); qinval_update_qtail(iommu, index); @@ -182,7 +183,7 @@ timeout = NOW() + MILLISECS(flush_dev_iotlb ? iommu_dev_iotlb_timeout : VTD_QI_TIMEOUT); - while ( poll_slot != QINVAL_STAT_DONE ) + while ( ACCESS_ONCE(*this_poll_slot) != QINVAL_STAT_DONE ) { if ( NOW() > timeout ) { diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/quirks.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/quirks.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/passthrough/vtd/quirks.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/passthrough/vtd/quirks.c 2019-12-11 14:35:39.000000000 +0000 @@ -540,3 +540,28 @@ break; } } + +void __init quirk_iommu_caps(struct iommu *iommu) +{ + /* + * IOMMU Quirks: + * + * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't + * implement superpages internally. + * + * There are issues changing the walk length under in-flight DMA, which + * has manifested as incompatibility between EPT/IOMMU sharing and the + * workaround for CVE-2018-12207 / XSA-304. Hide the superpages + * capabilities in the IOMMU, which will prevent Xen from sharing the EPT + * and IOMMU pagetables. + * + * Detection of SandyBridge unfortunately has to be done by processor + * model because the client parts don't expose their IOMMUs as PCI devices + * we could match with a Device ID. 
+ */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model == 0x2a || + boot_cpu_data.x86_model == 0x2d) ) + iommu->cap &= ~(0xful << 34); +} diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/video/vesa.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/video/vesa.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/video/vesa.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/video/vesa.c 2019-12-11 14:35:39.000000000 +0000 @@ -40,6 +40,11 @@ } custom_param("font", parse_font_height); +static inline paddr_t lfb_base(void) +{ + return ((paddr_t)vlfb_info.ext_lfb_base << 32) | vlfb_info.lfb_base; +} + void __init vesa_early_init(void) { unsigned int vram_vmode; @@ -97,15 +102,14 @@ lfbp.text_columns = vlfb_info.width / font->width; lfbp.text_rows = vlfb_info.height / font->height; - lfbp.lfb = lfb = ioremap(vlfb_info.lfb_base, vram_remap); + lfbp.lfb = lfb = ioremap(lfb_base(), vram_remap); if ( !lfb ) return; memset(lfb, 0, vram_remap); - printk(XENLOG_INFO "vesafb: framebuffer at %#x, mapped to 0x%p, " - "using %uk, total %uk\n", - vlfb_info.lfb_base, lfb, + printk(XENLOG_INFO "vesafb: framebuffer at 0x%" PRIpaddr ", mapped to 0x%p, using %uk, total %uk\n", + lfb_base(), lfb, vram_remap >> 10, vram_total >> 10); printk(XENLOG_INFO "vesafb: mode is %dx%dx%u, linelength=%d, font %ux%u\n", vlfb_info.width, vlfb_info.height, @@ -167,7 +171,7 @@ /* Try and find a power of two to add */ do { - rc = mtrr_add(vlfb_info.lfb_base, size_total, type, 1); + rc = mtrr_add(lfb_base(), size_total, type, 1); size_total >>= 1; } while ( (size_total >= PAGE_SIZE) && (rc == -EINVAL) ); } diff -Nru xen-4.11.1+92-g6c33308a8d/xen/drivers/vpci/vpci.c xen-4.11.3+24-g14b62ab3e5/xen/drivers/vpci/vpci.c --- xen-4.11.1+92-g6c33308a8d/xen/drivers/vpci/vpci.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/drivers/vpci/vpci.c 2019-12-11 14:35:39.000000000 +0000 @@ -421,6 +421,7 @@ const struct pci_dev *pdev; const struct vpci_register *r; unsigned int data_offset = 0; + const unsigned long *ro_map = pci_get_ro_map(sbdf.seg); if ( !size ) { @@ -428,6 +429,10 @@ return; } + if ( ro_map && test_bit(sbdf.bdf, ro_map) ) + /* Ignore writes to read-only devices. */ + return; + /* * Find the PCI dev matching the address. * Passthrough everything that's not trapped. 
diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm32/bitops.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm32/bitops.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm32/bitops.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm32/bitops.h 2019-12-11 14:35:39.000000000 +0000 @@ -1,20 +1,6 @@ #ifndef _ARM_ARM32_BITOPS_H #define _ARM_ARM32_BITOPS_H -extern void _set_bit(int nr, volatile void * p); -extern void _clear_bit(int nr, volatile void * p); -extern void _change_bit(int nr, volatile void * p); -extern int _test_and_set_bit(int nr, volatile void * p); -extern int _test_and_clear_bit(int nr, volatile void * p); -extern int _test_and_change_bit(int nr, volatile void * p); - -#define set_bit(n,p) _set_bit(n,p) -#define clear_bit(n,p) _clear_bit(n,p) -#define change_bit(n,p) _change_bit(n,p) -#define test_and_set_bit(n,p) _test_and_set_bit(n,p) -#define test_and_clear_bit(n,p) _test_and_clear_bit(n,p) -#define test_and_change_bit(n,p) _test_and_change_bit(n,p) - #define flsl fls /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm32/cmpxchg.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm32/cmpxchg.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm32/cmpxchg.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm32/cmpxchg.h 2019-12-11 14:35:39.000000000 +0000 @@ -52,72 +52,70 @@ * indicated by comparing RETURN with OLD. */ -extern void __bad_cmpxchg(volatile void *ptr, int size); +extern unsigned long __bad_cmpxchg(volatile void *ptr, int size); -static always_inline unsigned long __cmpxchg( - volatile void *ptr, unsigned long old, unsigned long new, int size) -{ - unsigned long oldval, res; +#define __CMPXCHG_CASE(sz, name) \ +static inline bool __cmpxchg_case_##name(volatile void *ptr, \ + unsigned long *old, \ + unsigned long new, \ + bool timeout, \ + unsigned int max_try) \ +{ \ + unsigned long oldval; \ + unsigned long res; \ + \ + do { \ + asm volatile("@ __cmpxchg_case_" #name "\n" \ + " ldrex" #sz " %1, [%2]\n" \ + " mov %0, #0\n" \ + " teq %1, %3\n" \ + " strex" #sz "eq %0, %4, [%2]\n" \ + : "=&r" (res), "=&r" (oldval) \ + : "r" (ptr), "Ir" (*old), "r" (new) \ + : "memory", "cc"); \ + \ + if (!res) \ + break; \ + } while (!timeout || ((--max_try) > 0)); \ + \ + *old = oldval; \ + \ + return !res; \ +} +__CMPXCHG_CASE(b, 1) +__CMPXCHG_CASE(h, 2) +__CMPXCHG_CASE( , 4) + +static always_inline bool __int_cmpxchg(volatile void *ptr, unsigned long *old, + unsigned long new, int size, + bool timeout, unsigned int max_try) +{ prefetchw((const void *)ptr); switch (size) { case 1: - do { - asm volatile("@ __cmpxchg1\n" - " ldrexb %1, [%2]\n" - " mov %0, #0\n" - " teq %1, %3\n" - " strexbeq %0, %4, [%2]\n" - : "=&r" (res), "=&r" (oldval) - : "r" (ptr), "Ir" (old), "r" (new) - : "memory", "cc"); - } while (res); - break; + return __cmpxchg_case_1(ptr, old, new, timeout, max_try); case 2: - do { - asm volatile("@ __cmpxchg2\n" - " ldrexh %1, [%2]\n" - " mov %0, #0\n" - " teq %1, %3\n" - " strexheq %0, %4, [%2]\n" - : "=&r" (res), "=&r" (oldval) - : "r" (ptr), "Ir" (old), "r" (new) - : "memory", "cc"); - } while (res); - break; + return __cmpxchg_case_2(ptr, old, new, timeout, max_try); case 4: - do { - asm volatile("@ __cmpxchg4\n" - " ldrex %1, [%2]\n" - " mov %0, #0\n" - " teq %1, %3\n" - " strexeq %0, %4, [%2]\n" - : "=&r" (res), "=&r" (oldval) - : "r" (ptr), "Ir" (old), "r" (new) - : "memory", "cc"); - } while (res); - break; -#if 0 - case 8: - do { - asm volatile("@ 
__cmpxchg8\n" - " ldrexd %1, [%2]\n" - " mov %0, #0\n" - " teq %1, %3\n" - " strexdeq %0, %4, [%2]\n" - : "=&r" (res), "=&r" (oldval) - : "r" (ptr), "Ir" (old), "r" (new) - : "memory", "cc"); - } while (res); - break; -#endif + return __cmpxchg_case_4(ptr, old, new, timeout, max_try); default: - __bad_cmpxchg(ptr, size); - oldval = 0; + return __bad_cmpxchg(ptr, size); } - return oldval; + ASSERT_UNREACHABLE(); +} + +static always_inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, + int size) +{ + if (!__int_cmpxchg(ptr, &old, new, size, false, 0)) + ASSERT_UNREACHABLE(); + + return old; } static always_inline unsigned long __cmpxchg_mb(volatile void *ptr, @@ -133,6 +131,25 @@ return ret; } +/* + * The helper may fail to update the memory if the action takes too long. + * + * @old: On call the value pointed contains the expected old value. It will be + * updated to the actual old value. + * @max_try: Maximum number of iterations + * + * The helper will return true when the update has succeeded (i.e no + * timeout) and false if the update has failed. + */ +static always_inline bool __cmpxchg_mb_timeout(volatile void *ptr, + unsigned long *old, + unsigned long new, + int size, + unsigned int max_try) +{ + return __int_cmpxchg(ptr, old, new, size, true, max_try); +} + #define cmpxchg(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg_mb((ptr), \ (unsigned long)(o), \ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm64/bitops.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm64/bitops.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm64/bitops.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm64/bitops.h 2019-12-11 14:35:39.000000000 +0000 @@ -1,16 +1,6 @@ #ifndef _ARM_ARM64_BITOPS_H #define _ARM_ARM64_BITOPS_H -/* - * Little endian assembly atomic bitops. - */ -extern void set_bit(int nr, volatile void *p); -extern void clear_bit(int nr, volatile void *p); -extern void change_bit(int nr, volatile void *p); -extern int test_and_set_bit(int nr, volatile void *p); -extern int test_and_clear_bit(int nr, volatile void *p); -extern int test_and_change_bit(int nr, volatile void *p); - /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */ /** * __ffs - find first bit in word. 
diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm64/cmpxchg.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm64/cmpxchg.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/arm64/cmpxchg.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/arm64/cmpxchg.h 2019-12-11 14:35:39.000000000 +0000 @@ -61,84 +61,79 @@ __ret; \ }) -extern void __bad_cmpxchg(volatile void *ptr, int size); +extern unsigned long __bad_cmpxchg(volatile void *ptr, int size); -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long oldval = 0, res; +#define __CMPXCHG_CASE(w, sz, name) \ +static inline bool __cmpxchg_case_##name(volatile void *ptr, \ + unsigned long *old, \ + unsigned long new, \ + bool timeout, \ + unsigned int max_try) \ +{ \ + unsigned long oldval; \ + unsigned long res; \ + \ + do { \ + asm volatile("// __cmpxchg_case_" #name "\n" \ + " ldxr" #sz " %" #w "1, %2\n" \ + " mov %w0, #0\n" \ + " cmp %" #w "1, %" #w "3\n" \ + " b.ne 1f\n" \ + " stxr" #sz " %w0, %" #w "4, %2\n" \ + "1:\n" \ + : "=&r" (res), "=&r" (oldval), \ + "+Q" (*(unsigned long *)ptr) \ + : "Ir" (*old), "r" (new) \ + : "cc"); \ + \ + if (!res) \ + break; \ + } while (!timeout || ((--max_try) > 0)); \ + \ + *old = oldval; \ + \ + return !res; \ +} +__CMPXCHG_CASE(w, b, 1) +__CMPXCHG_CASE(w, h, 2) +__CMPXCHG_CASE(w, , 4) +__CMPXCHG_CASE( , , 8) + +static always_inline bool __int_cmpxchg(volatile void *ptr, unsigned long *old, + unsigned long new, int size, + bool timeout, unsigned int max_try) +{ switch (size) { case 1: - do { - asm volatile("// __cmpxchg1\n" - " ldxrb %w1, %2\n" - " mov %w0, #0\n" - " cmp %w1, %w3\n" - " b.ne 1f\n" - " stxrb %w0, %w4, %2\n" - "1:\n" - : "=&r" (res), "=&r" (oldval), "+Q" (*(u8 *)ptr) - : "Ir" (old), "r" (new) - : "cc"); - } while (res); - break; - + return __cmpxchg_case_1(ptr, old, new, timeout, max_try); case 2: - do { - asm volatile("// __cmpxchg2\n" - " ldxrh %w1, %2\n" - " mov %w0, #0\n" - " cmp %w1, %w3\n" - " b.ne 1f\n" - " stxrh %w0, %w4, %2\n" - "1:\n" - : "=&r" (res), "=&r" (oldval), "+Q" (*(u16 *)ptr) - : "Ir" (old), "r" (new) - : "cc"); - } while (res); - break; - + return __cmpxchg_case_2(ptr, old, new, timeout, max_try); case 4: - do { - asm volatile("// __cmpxchg4\n" - " ldxr %w1, %2\n" - " mov %w0, #0\n" - " cmp %w1, %w3\n" - " b.ne 1f\n" - " stxr %w0, %w4, %2\n" - "1:\n" - : "=&r" (res), "=&r" (oldval), "+Q" (*(u32 *)ptr) - : "Ir" (old), "r" (new) - : "cc"); - } while (res); - break; - + return __cmpxchg_case_4(ptr, old, new, timeout, max_try); case 8: - do { - asm volatile("// __cmpxchg8\n" - " ldxr %1, %2\n" - " mov %w0, #0\n" - " cmp %1, %3\n" - " b.ne 1f\n" - " stxr %w0, %4, %2\n" - "1:\n" - : "=&r" (res), "=&r" (oldval), "+Q" (*(u64 *)ptr) - : "Ir" (old), "r" (new) - : "cc"); - } while (res); - break; - + return __cmpxchg_case_8(ptr, old, new, timeout, max_try); default: - __bad_cmpxchg(ptr, size); - oldval = 0; + return __bad_cmpxchg(ptr, size); } - return oldval; + ASSERT_UNREACHABLE(); } -static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old, - unsigned long new, int size) +static always_inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, + int size) +{ + if (!__int_cmpxchg(ptr, &old, new, size, false, 0)) + ASSERT_UNREACHABLE(); + + return old; +} + +static always_inline unsigned long __cmpxchg_mb(volatile void *ptr, + unsigned long old, + unsigned long new, int size) { unsigned long ret; @@ -149,6 
+144,25 @@ return ret; } +/* + * The helper may fail to update the memory if the action takes too long. + * + * @old: On call the value pointed contains the expected old value. It will be + * updated to the actual old value. + * @max_try: Maximum number of iterations + * + * The helper will return true when the update has succeeded (i.e no + * timeout) and false if the update has failed. + */ +static always_inline bool __cmpxchg_mb_timeout(volatile void *ptr, + unsigned long *old, + unsigned long new, + int size, + unsigned int max_try) +{ + return __int_cmpxchg(ptr, old, new, size, true, max_try); +} + #define cmpxchg(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/atomic.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/atomic.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/atomic.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/atomic.h 2019-12-11 14:35:39.000000000 +0000 @@ -55,6 +55,19 @@ #if defined (CONFIG_ARM_64) build_atomic_read(read_u64_atomic, "", "", uint64_t, "=r") build_atomic_write(write_u64_atomic, "", "", uint64_t, "r") +#elif defined (CONFIG_ARM_32) +static inline uint64_t read_u64_atomic(const volatile uint64_t *addr) +{ + uint64_t val; + + asm volatile ( "ldrd %0,%H0,%1" : "=r" (val) : "m" (*addr) ); + + return val; +} +static inline void write_u64_atomic(volatile uint64_t *addr, uint64_t val) +{ + asm volatile ( "strd %1,%H1,%0" : "=m" (*addr) : "r" (val) ); +} #endif build_add_sized(add_u8_sized, "b", BYTE, uint8_t, "ri") @@ -69,6 +82,7 @@ case 1: __x = (typeof(*p))read_u8_atomic((uint8_t *)p); break; \ case 2: __x = (typeof(*p))read_u16_atomic((uint16_t *)p); break; \ case 4: __x = (typeof(*p))read_u32_atomic((uint32_t *)p); break; \ + case 8: __x = (typeof(*p))read_u64_atomic((uint64_t *)p); break; \ default: __x = 0; __bad_atomic_size(); break; \ } \ __x; \ @@ -80,6 +94,7 @@ case 1: write_u8_atomic((uint8_t *)p, (uint8_t)__x); break; \ case 2: write_u16_atomic((uint16_t *)p, (uint16_t)__x); break; \ case 4: write_u32_atomic((uint32_t *)p, (uint32_t)__x); break; \ + case 8: write_u64_atomic((uint64_t *)p, (uint64_t)__x); break; \ default: __bad_atomic_size(); break; \ } \ __x; \ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/bitops.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/bitops.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/bitops.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/bitops.h 2019-12-11 14:35:39.000000000 +0000 @@ -38,6 +38,44 @@ # error "unknown ARM variant" #endif +/* + * Atomic bitops + * + * The helpers below *should* only be used on memory shared between + * trusted threads or we know the memory cannot be accessed by another + * thread. + */ + +void set_bit(int nr, volatile void *p); +void clear_bit(int nr, volatile void *p); +void change_bit(int nr, volatile void *p); +int test_and_set_bit(int nr, volatile void *p); +int test_and_clear_bit(int nr, volatile void *p); +int test_and_change_bit(int nr, volatile void *p); + +void clear_mask16(uint16_t mask, volatile void *p); + +/* + * The helpers below may fail to update the memory if the action takes + * too long. + * + * @max_try: Maximum number of iterations + * + * The helpers will return true when the update has succeeded (i.e no + * timeout) and false if the update has failed. 
+ */ +bool set_bit_timeout(int nr, volatile void *p, unsigned int max_try); +bool clear_bit_timeout(int nr, volatile void *p, unsigned int max_try); +bool change_bit_timeout(int nr, volatile void *p, unsigned int max_try); +bool test_and_set_bit_timeout(int nr, volatile void *p, + int *oldbit, unsigned int max_try); +bool test_and_clear_bit_timeout(int nr, volatile void *p, + int *oldbit, unsigned int max_try); +bool test_and_change_bit_timeout(int nr, volatile void *p, + int *oldbit, unsigned int max_try); +bool clear_mask16_timeout(uint16_t mask, volatile void *p, + unsigned int max_try); + /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/domain.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/domain.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/domain.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/domain.h 2019-12-11 14:35:39.000000000 +0000 @@ -163,7 +163,8 @@ #endif /* Control Registers */ - uint32_t actlr, sctlr; + register_t sctlr; + uint32_t actlr; uint32_t cpacr; uint32_t contextidr; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/grant_table.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/grant_table.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/grant_table.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/grant_table.h 2019-12-11 14:35:39.000000000 +0000 @@ -14,7 +14,7 @@ gfn_t *status_gfn; }; -void gnttab_clear_flag(unsigned long nr, uint16_t *addr); +void gnttab_clear_flag(struct domain *d, unsigned long nr, uint16_t *addr); int create_grant_host_mapping(unsigned long gpaddr, mfn_t mfn, unsigned int flags, unsigned int cache_flags); #define gnttab_host_mapping_get_page_type(ro, ld, rd) (0) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/guest_atomics.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/guest_atomics.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/guest_atomics.h 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/guest_atomics.h 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,126 @@ +#ifndef _ARM_GUEST_ATOMICS_H +#define _ARM_GUEST_ATOMICS_H + +#include +#include + +/* + * The guest atomics helpers shares the same logic. We first try to use + * the *_timeout version of the operation. If it didn't timeout, then we + * successfully updated the memory. Nothing else to do. + * + * If it did timeout, then it means we didn't manage to update the + * memory. This is possibly because the guest is misbehaving (i.e tight + * store loop) but can also happen for other reasons (i.e nested Xen). + * In that case pause the domain and retry the operation, this time + * without a timeout. + * + * Note, those helpers rely on other part of the code to prevent sharing + * a page between Xen and multiple domain. 
+ */ + +DECLARE_PER_CPU(unsigned int, guest_safe_atomic_max); + +#define guest_bitop(name) \ +static inline void guest_##name(struct domain *d, int nr, volatile void *p) \ +{ \ + perfc_incr(atomics_guest); \ + \ + if ( name##_timeout(nr, p, this_cpu(guest_safe_atomic_max)) ) \ + return; \ + \ + perfc_incr(atomics_guest_paused); \ + \ + domain_pause_nosync(d); \ + name(nr, p); \ + domain_unpause(d); \ +} + +#define guest_testop(name) \ +static inline int guest_##name(struct domain *d, int nr, volatile void *p) \ +{ \ + bool succeed; \ + int oldbit; \ + \ + perfc_incr(atomics_guest); \ + \ + succeed = name##_timeout(nr, p, &oldbit, \ + this_cpu(guest_safe_atomic_max)); \ + if ( succeed ) \ + return oldbit; \ + \ + perfc_incr(atomics_guest_paused); \ + \ + domain_pause_nosync(d); \ + oldbit = name(nr, p); \ + domain_unpause(d); \ + \ + return oldbit; \ +} + +guest_bitop(set_bit) +guest_bitop(clear_bit) +guest_bitop(change_bit) + +#undef guest_bitop + +/* test_bit does not use load-store atomic operations */ +#define guest_test_bit(d, nr, p) ((void)(d), test_bit(nr, p)) + +guest_testop(test_and_set_bit) +guest_testop(test_and_clear_bit) +guest_testop(test_and_change_bit) + +#undef guest_testop + +static inline void guest_clear_mask16(struct domain *d, uint16_t mask, + volatile uint16_t *p) +{ + perfc_incr(atomics_guest); + + if ( clear_mask16_timeout(mask, p, this_cpu(guest_safe_atomic_max)) ) + return; + + domain_pause_nosync(d); + clear_mask16(mask, p); + domain_unpause(d); +} + +static inline unsigned long __guest_cmpxchg(struct domain *d, + volatile void *ptr, + unsigned long old, + unsigned long new, + unsigned int size) +{ + unsigned long oldval = old; + + perfc_incr(atomics_guest); + + if ( __cmpxchg_mb_timeout(ptr, &oldval, new, size, + this_cpu(guest_safe_atomic_max)) ) + return oldval; + + perfc_incr(atomics_guest_paused); + + domain_pause_nosync(d); + oldval = __cmpxchg_mb(ptr, old, new, size); + domain_unpause(d); + + return oldval; +} + +#define guest_cmpxchg(d, ptr, o, n) \ + ((__typeof__(*(ptr)))__guest_cmpxchg(d, ptr, \ + (unsigned long)(o),\ + (unsigned long)(n),\ + sizeof (*(ptr)))) + +#endif /* _ARM_GUEST_ATOMICS_H */ +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/mm.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/mm.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/mm.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/mm.h 2019-12-11 14:35:39.000000000 +0000 @@ -135,6 +135,7 @@ extern vaddr_t xenheap_virt_end; #ifdef CONFIG_ARM_64 extern vaddr_t xenheap_virt_start; +extern unsigned long xenheap_base_pdx; #endif #ifdef CONFIG_ARM_32 @@ -152,7 +153,7 @@ #define is_xen_fixed_mfn(mfn) \ ((pfn_to_paddr(mfn) >= virt_to_maddr(&_start)) && \ - (pfn_to_paddr(mfn) <= virt_to_maddr(&_end))) + (pfn_to_paddr(mfn) <= virt_to_maddr((vaddr_t)_end - 1))) #define page_get_owner(_p) (_p)->v.inuse.domain #define page_set_owner(_p,_d) ((_p)->v.inuse.domain = (_d)) @@ -253,9 +254,10 @@ #else static inline void *maddr_to_virt(paddr_t ma) { - ASSERT(pfn_to_pdx(ma >> PAGE_SHIFT) < (DIRECTMAP_SIZE >> PAGE_SHIFT)); + ASSERT((pfn_to_pdx(ma >> PAGE_SHIFT) - xenheap_base_pdx) < + (DIRECTMAP_SIZE >> PAGE_SHIFT)); return (void *)(XENHEAP_VIRT_START - - mfn_to_maddr(xenheap_mfn_start) + + (xenheap_base_pdx << PAGE_SHIFT) + ((ma & ma_va_bottom_mask) | ((ma & ma_top_mask) >> pfn_pdx_hole_shift))); } diff -Nru 
xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/p2m.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/p2m.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/p2m.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/p2m.h 2019-12-11 14:35:39.000000000 +0000 @@ -38,10 +38,7 @@ /* Current Translation Table Base Register for the p2m */ uint64_t vttbr; - /* - * Highest guest frame that's ever been mapped in the p2m - * Only takes into account ram and foreign mapping - */ + /* Highest guest frame that's ever been mapped in the p2m */ gfn_t max_mapped_gfn; /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/percpu.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/percpu.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/percpu.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/percpu.h 2019-12-11 14:35:39.000000000 +0000 @@ -16,10 +16,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; void percpu_init_areas(void); -/* Separate out the type, so (int[3], foo) works. */ -#define __DEFINE_PER_CPU(type, name, suffix) \ - __section(".bss.percpu" #suffix) \ - __typeof__(type) per_cpu_##name +#define __DEFINE_PER_CPU(attr, type, name) \ + attr __typeof__(type) per_cpu_ ## name #define per_cpu(var, cpu) \ (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/perfc_defn.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/perfc_defn.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/perfc_defn.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/perfc_defn.h 2019-12-11 14:35:39.000000000 +0000 @@ -73,6 +73,9 @@ PERFCOUNTER(virt_timer_irqs, "Virtual timer interrupts") PERFCOUNTER(maintenance_irqs, "Maintenance interrupts") +PERFCOUNTER(atomics_guest, "atomics: guest access") +PERFCOUNTER(atomics_guest_paused, "atomics: guest paused") + /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/time.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/time.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-arm/time.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-arm/time.h 2019-12-11 14:35:39.000000000 +0000 @@ -1,15 +1,18 @@ #ifndef __ARM_TIME_H__ #define __ARM_TIME_H__ +#include + #define DT_MATCH_TIMER \ DT_MATCH_COMPATIBLE("arm,armv7-timer"), \ DT_MATCH_COMPATIBLE("arm,armv8-timer") -typedef unsigned long cycles_t; +typedef uint64_t cycles_t; static inline cycles_t get_cycles (void) { - return 0; + isb(); + return READ_SYSREG64(CNTPCT_EL0); } /* List of timer's IRQ */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/bitops.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/bitops.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/bitops.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/bitops.h 2019-12-11 14:35:39.000000000 +0000 @@ -358,7 +358,7 @@ const unsigned long *a__ = (addr); \ unsigned int s__ = (size); \ unsigned int o__ = (off); \ - if ( __builtin_constant_p(size) && !s__ ) \ + if ( o__ >= s__ ) \ r__ = s__; \ else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG ) \ r__ = o__ + __scanbit(*(const unsigned long *)(a__) >> o__, s__); \ @@ -390,7 +390,7 @@ const unsigned long *a__ = (addr); \ unsigned int s__ = (size); \ unsigned int o__ = (off); \ - if ( __builtin_constant_p(size) && !s__ ) \ + if ( o__ >= s__ ) \ r__ = s__; \ else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG ) \ r__ = o__ + __scanbit(~*(const 
unsigned long *)(a__) >> o__, s__); \ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/domain.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/domain.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/domain.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/domain.h 2019-12-11 14:35:39.000000000 +0000 @@ -309,7 +309,7 @@ struct paging_domain paging; struct p2m_domain *p2m; - /* To enforce lock ordering in the pod code wrt the + /* To enforce lock ordering in the pod code wrt the * page_alloc lock */ int page_alloc_unlock_level; @@ -542,6 +542,8 @@ struct page_info *old_guest_table; /* partially destructed pagetable */ struct page_info *old_guest_ptpg; /* containing page table of the */ /* former, if any */ + bool old_guest_table_partial; /* Are we dropping a type ref, or just + * finishing up a partial de-validation? */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/grant_table.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/grant_table.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/grant_table.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/grant_table.h 2019-12-11 14:35:39.000000000 +0000 @@ -82,7 +82,8 @@ #define gnttab_mark_dirty(d, f) paging_mark_dirty((d), f) -static inline void gnttab_clear_flag(unsigned int nr, uint16_t *st) +static inline void gnttab_clear_flag(struct domain *d, unsigned int nr, + uint16_t *st) { /* * Note that this cannot be clear_bit(), as the access must be diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/guest_atomics.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/guest_atomics.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/guest_atomics.h 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/guest_atomics.h 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,32 @@ +#ifndef _X86_GUEST_ATOMICS_H +#define _X86_GUEST_ATOMICS_H + +#include + +/* + * It is safe to use the atomics helpers on x86 on memory shared with + * the guests. 
+ */ +#define guest_set_bit(d, nr, p) ((void)(d), set_bit(nr, p)) +#define guest_clear_bit(d, nr, p) ((void)(d), clear_bit(nr, p)) +#define guest_change_bit(d, nr, p) ((void)(d), change_bit(nr, p)) +#define guest_test_bit(d, nr, p) ((void)(d), test_bit(nr, p)) + +#define guest_test_and_set_bit(d, nr, p) \ + ((void)(d), test_and_set_bit(nr, p)) +#define guest_test_and_clear_bit(d, nr, p) \ + ((void)(d), test_and_clear_bit(nr, p)) +#define guest_test_and_change_bit(d, nr, p) \ + ((void)(d), test_and_change_bit(nr, p)) + +#define guest_cmpxchg(d, ptr, o, n) ((void)(d), cmpxchg(ptr, o, n)) + +#endif /* _X86_GUEST_ATOMICS_H */ +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/hvm.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/hvm.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/hvm.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/hvm.h 2019-12-11 14:35:39.000000000 +0000 @@ -209,7 +209,6 @@ bool_t access_w, bool_t access_x); void (*enable_msr_interception)(struct domain *d, uint32_t msr); - void (*set_icebp_interception)(struct domain *d, bool enable); bool_t (*is_singlestep_supported)(void); /* Alternate p2m */ @@ -484,7 +483,7 @@ enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int }; void hvm_task_switch( uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, - int32_t errcode); + int32_t errcode, unsigned int insn_len); enum hvm_access_type { hvm_access_insn_fetch, @@ -596,16 +595,6 @@ return 0; } -static inline bool hvm_set_icebp_interception(struct domain *d, bool enable) -{ - if ( hvm_funcs.set_icebp_interception ) - { - hvm_funcs.set_icebp_interception(d, enable); - return true; - } - return false; -} - static inline bool_t hvm_is_singlestep_supported(void) { return (hvm_funcs.is_singlestep_supported && diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h 2019-12-11 14:35:39.000000000 +0000 @@ -51,6 +51,9 @@ int amd_iommu_init(void); int amd_iommu_update_ivrs_mapping_acpi(void); +int amd_iommu_get_paging_mode(unsigned long entries); +int amd_iommu_quarantine_init(struct domain *d); + /* mapping functions */ int __must_check amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/emulate.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/emulate.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/emulate.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/emulate.h 2019-12-11 14:35:39.000000000 +0000 @@ -54,6 +54,8 @@ return __get_instruction_length_from_list(v, &instr, 1); } +unsigned int svm_get_task_switch_insn_len(void); + #endif /* __ASM_X86_HVM_SVM_EMULATE_H__ */ /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/svmdebug.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/svmdebug.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/svm/svmdebug.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/svm/svmdebug.h 2019-12-11 14:35:39.000000000 +0000 @@ -22,6 +22,7 @@ 
#include #include +void svm_sync_vmcb(struct vcpu *v, enum vmcb_sync_state new_state); void svm_vmcb_dump(const char *from, const struct vmcb_struct *vmcb); bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb, const struct vcpu *v, bool verbose); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vlapic.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vlapic.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vlapic.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vlapic.h 2019-12-11 14:35:39.000000000 +0000 @@ -145,4 +145,10 @@ const struct vlapic *target, const struct vlapic *source, int short_hand, uint32_t dest, bool_t dest_mode); +static inline void vlapic_sync_pir_to_irr(struct vcpu *v) +{ + if ( hvm_funcs.sync_pir_to_irr ) + hvm_funcs.sync_pir_to_irr(v); +} + #endif /* __ASM_X86_HVM_VLAPIC_H__ */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vmx/vmcs.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vmx/vmcs.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vmx/vmcs.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vmx/vmcs.h 2019-12-11 14:35:39.000000000 +0000 @@ -63,6 +63,12 @@ unsigned long apic_access_mfn; /* VMX_DOMAIN_* */ unsigned int status; + + /* + * Domain permitted to use Executable EPT Superpages? Cleared to work + * around CVE-2018-12207 as appropriate. + */ + bool exec_sp; }; /* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vmx/vmx.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vmx/vmx.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/hvm/vmx/vmx.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/hvm/vmx/vmx.h 2019-12-11 14:35:39.000000000 +0000 @@ -28,6 +28,8 @@ #include #include +extern int8_t opt_ept_exec_sp; + typedef union { struct { u64 r : 1, /* bit 0 - Read permission */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/mm.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/mm.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/mm.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/mm.h 2019-12-11 14:35:39.000000000 +0000 @@ -228,19 +228,34 @@ * setting the flag must not drop that reference, whereas the instance * clearing it will have to. * - * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has - * been partially validated. This implies that the general reference - * to the page (acquired from get_page_from_lNe()) would be dropped - * (again due to the apparent failure) and hence must be re-acquired - * when resuming the validation, but must not be dropped when picking - * up the page for invalidation. - * - * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has - * been partially invalidated. This is basically the opposite case of - * above, i.e. the general reference to the page was not dropped in - * put_page_from_lNe() (due to the apparent failure), and hence it - * must be dropped when the put operation is resumed (and completes), - * but it must not be acquired if picking up the page for validation. + * If partial_flags & PTF_partial_set is set, then the page at + * at @nr_validated_ptes had PGT_partial set as a result of an + * operation on the current page. (That page may or may not + * still have PGT_partial set.) + * + * Additionally, if PTF_partial_set is set, then the PTE at + * @nr_validated_ptef holds a general reference count for the + * page. 
+ * + * This happens: + * - During validation or de-validation, if the operation was + * interrupted + * - During validation, if an invalid entry is encountered and + * validation is preemptible + * - During validation, if PTF_partial_set was set on this + * entry to begin with (perhaps because it picked up a + * previous operation) + * + * When resuming validation, if PTF_partial_set is clear, then + * a general reference must be re-acquired; if it is set, no + * reference should be acquired. + * + * When resuming de-validation, if PTF_partial_set is clear, + * no reference should be dropped; if it is set, a reference + * should be dropped. + * + * NB that PTF_partial_set is defined in mm.c, the only place + * where it is used. * * The 3rd field, @linear_pt_count, indicates * - by a positive value, how many same-level page table entries a page @@ -250,8 +265,8 @@ */ struct { u16 nr_validated_ptes:PAGETABLE_ORDER + 1; - u16 :16 - PAGETABLE_ORDER - 1 - 2; - s16 partial_pte:2; + u16 :16 - PAGETABLE_ORDER - 1 - 1; + u16 partial_flags:1; s16 linear_pt_count; }; @@ -280,8 +295,8 @@ #define is_xen_heap_mfn(mfn) \ (__mfn_valid(mfn) && is_xen_heap_page(mfn_to_page(_mfn(mfn)))) #define is_xen_fixed_mfn(mfn) \ - ((((mfn) << PAGE_SHIFT) >= __pa(&_stext)) && \ - (((mfn) << PAGE_SHIFT) <= __pa(&__2M_rwdata_end))) + ((((mfn) << PAGE_SHIFT) >= __pa(_stext)) && \ + (((mfn) << PAGE_SHIFT) <= __pa(__2M_rwdata_end - 1))) #define PRtype_info "016lx"/* should only be used for printk's */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/msi.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/msi.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/msi.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/msi.h 2019-12-11 14:35:39.000000000 +0000 @@ -92,6 +92,7 @@ extern void teardown_msi_irq(int irq); extern int msi_free_vector(struct msi_desc *entry); extern int pci_restore_msi_state(struct pci_dev *pdev); +extern int pci_reset_msix_state(struct pci_dev *pdev); struct msi_desc { struct msi_attrib { diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/msr-index.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/msr-index.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/msr-index.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/msr-index.h 2019-12-11 14:35:39.000000000 +0000 @@ -54,6 +54,9 @@ #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) +#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) +#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) +#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) #define MSR_FLUSH_CMD 0x0000010b #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) @@ -61,6 +64,10 @@ #define MSR_TSX_FORCE_ABORT 0x0000010f #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0) +#define MSR_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0) +#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1) + /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_A_PERFCTR0 0x000004c1 diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/percpu.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/percpu.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/percpu.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/percpu.h 2019-12-11 14:35:39.000000000 +0000 @@ -7,10 +7,8 @@ void percpu_init_areas(void); #endif -/* Separate out the type, so (int[3], foo) works. 
*/ -#define __DEFINE_PER_CPU(type, name, suffix) \ - __section(".bss.percpu" #suffix) \ - __typeof__(type) per_cpu_##name +#define __DEFINE_PER_CPU(attr, type, name) \ + attr __typeof__(type) per_cpu_ ## name /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) \ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/processor.h xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/processor.h --- xen-4.11.1+92-g6c33308a8d/xen/include/asm-x86/processor.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/asm-x86/processor.h 2019-12-11 14:35:39.000000000 +0000 @@ -258,6 +258,16 @@ return ebx; } +static always_inline unsigned int cpuid_count_edx( + unsigned int leaf, unsigned int subleaf) +{ + unsigned int edx, tmp; + + cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx); + + return edx; +} + static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *data) { @@ -417,7 +427,7 @@ #define IOBMP_BYTES 8192 #define IOBMP_INVALID_OFFSET 0x8000 -struct __packed __cacheline_aligned tss_struct { +struct __packed tss64 { uint32_t :32; uint64_t rsp0, rsp1, rsp2; uint64_t :64; @@ -428,9 +438,11 @@ uint64_t ist[7]; uint64_t :64; uint16_t :16, bitmap; - /* Pads the TSS to be cacheline-aligned (total size is 0x80). */ - uint8_t __cacheline_filler[24]; }; +struct tss_page { + struct tss64 __aligned(PAGE_SIZE) tss; +}; +DECLARE_PER_CPU(struct tss_page, tss_page); #define IST_NONE 0UL #define IST_DF 1UL @@ -469,7 +481,6 @@ extern idt_entry_t idt_table[]; extern idt_entry_t *idt_tables[]; -DECLARE_PER_CPU(struct tss_struct, init_tss); DECLARE_PER_CPU(root_pgentry_t *, root_pgt); extern void write_ptbase(struct vcpu *v); @@ -609,6 +620,9 @@ return fam; } +extern int8_t opt_tsx, cpu_has_tsx_ctrl; +void tsx_init(void); + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_X86_PROCESSOR_H */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/public/arch-arm.h xen-4.11.3+24-g14b62ab3e5/xen/include/public/arch-arm.h --- xen-4.11.1+92-g6c33308a8d/xen/include/public/arch-arm.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/public/arch-arm.h 2019-12-11 14:35:39.000000000 +0000 @@ -291,7 +291,7 @@ struct vcpu_guest_core_regs user_regs; /* Core CPU registers */ - uint32_t sctlr; + uint64_t sctlr; uint64_t ttbcr, ttbr0, ttbr1; }; typedef struct vcpu_guest_context vcpu_guest_context_t; @@ -374,7 +374,7 @@ #define PSR_GUEST32_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_SVC) #define PSR_GUEST64_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_EL1h) -#define SCTLR_GUEST_INIT 0x00c50078 +#define SCTLR_GUEST_INIT xen_mk_ullong(0x00c50078) /* * Virtual machine platform (memory layout, interrupts) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/public/domctl.h xen-4.11.3+24-g14b62ab3e5/xen/include/public/domctl.h --- xen-4.11.1+92-g6c33308a8d/xen/include/public/domctl.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/public/domctl.h 2019-12-11 14:35:39.000000000 +0000 @@ -138,6 +138,10 @@ #define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28) /* XEN_DOMCTL_getpageframeinfo3 */ +/* + * Both value `num' and `array' may get modified by the hypercall to allow + * preemption. + */ struct xen_domctl_getpageframeinfo3 { /* IN variables. 
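Back on the processor.h hunk above: cpuid_count_edx() is simply a convenience wrapper returning the EDX output of a sub-leaf CPUID query. A hedged usage sketch; the bit positions (13 and 29 in leaf 7, sub-leaf 0) are taken from Intel's documentation, not from this diff:

    /* Illustrative only: probe two leaf 7, sub-leaf 0 EDX bits with the new
     * helper.  Bit numbers come from the SDM, not from this patch. */
    static void __init probe_leaf7_edx(void)
    {
        unsigned int edx = cpuid_count_edx(7, 0);
        bool tsx_force_abort = edx & (1u << 13);
        bool arch_caps       = edx & (1u << 29);

        printk("TSX_FORCE_ABORT: %d, ARCH_CAPABILITIES: %d\n",
               tsx_force_abort, arch_caps);
    }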
*/ uint64_aligned_t num; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/public/xen-compat.h xen-4.11.3+24-g14b62ab3e5/xen/include/public/xen-compat.h --- xen-4.11.1+92-g6c33308a8d/xen/include/public/xen-compat.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/public/xen-compat.h 2019-12-11 14:35:39.000000000 +0000 @@ -27,7 +27,7 @@ #ifndef __XEN_PUBLIC_XEN_COMPAT_H__ #define __XEN_PUBLIC_XEN_COMPAT_H__ -#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040900 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040901 #if defined(__XEN__) || defined(__XEN_TOOLS__) /* Xen is built with matching headers and implements the latest interface. */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/public/xen.h xen-4.11.3+24-g14b62ab3e5/xen/include/public/xen.h --- xen-4.11.1+92-g6c33308a8d/xen/include/public/xen.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/public/xen.h 2019-12-11 14:35:39.000000000 +0000 @@ -922,6 +922,12 @@ uint32_t gbl_caps; /* Mode attributes (offset 0x0, VESA command 0x4f01). */ uint16_t mode_attrs; + uint16_t pad; +#endif +#if __XEN_INTERFACE_VERSION__ >= 0x00040901 && \ + __XEN_INTERFACE_VERSION__ != 0x00040a00 + /* high 32 bits of lfb_base */ + uint32_t ext_lfb_base; #endif } vesa_lfb; } u; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/cpuidle.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/cpuidle.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/cpuidle.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/cpuidle.h 2019-12-11 14:35:39.000000000 +0000 @@ -29,7 +29,7 @@ #include #include -#define ACPI_PROCESSOR_MAX_POWER 8 +#define ACPI_PROCESSOR_MAX_POWER 12 #define CPUIDLE_NAME_LEN 16 #define ACPI_CSTATE_EM_NONE 0 diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/event.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/event.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/event.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/event.h 2019-12-11 14:35:39.000000000 +0000 @@ -83,9 +83,6 @@ /* Notify remote end of a Xen-attached event channel.*/ void notify_via_xen_event_channel(struct domain *ld, int lport); -/* Inject an event channel notification into the guest */ -void arch_evtchn_inject(struct vcpu *v); - /* * Internal event channel object storage. 
* diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/iommu.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/iommu.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/iommu.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/iommu.h 2019-12-11 14:35:39.000000000 +0000 @@ -29,7 +29,7 @@ #include extern bool_t iommu_enable, iommu_enabled; -extern bool_t force_iommu, iommu_verbose; +extern bool force_iommu, iommu_quarantine, iommu_verbose; extern bool_t iommu_workaround_bios_bug, iommu_igfx, iommu_passthrough; extern bool_t iommu_snoop, iommu_qinval, iommu_intremap, iommu_intpost; extern bool_t iommu_hap_pt_share; @@ -139,6 +139,7 @@ struct iommu_ops { int (*init)(struct domain *d); void (*hwdom_init)(struct domain *d); + int (*quarantine_init)(struct domain *d); int (*add_device)(u8 devfn, device_t *dev); int (*enable_device)(device_t *dev); int (*remove_device)(u8 devfn, device_t *dev); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/lib.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/lib.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/lib.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/lib.h 2019-12-11 14:35:39.000000000 +0000 @@ -116,6 +116,16 @@ #define gprintk(lvl, fmt, args...) \ printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args) +#define printk_once(fmt, args...) \ +({ \ + static bool __read_mostly once_; \ + if ( unlikely(!once_) ) \ + { \ + once_ = true; \ + printk(fmt, ## args); \ + } \ +}) + #ifdef NDEBUG static inline void diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/livepatch.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/livepatch.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/livepatch.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/livepatch.h 2019-12-11 14:35:39.000000000 +0000 @@ -103,6 +103,7 @@ * These functions are called around the critical region patching live code, * for an architecture to take make appropratie global state adjustments. */ +int arch_livepatch_safety_check(void); int arch_livepatch_quiesce(void); void arch_livepatch_revive(void); diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/pci.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/pci.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/pci.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/pci.h 2019-12-11 14:35:39.000000000 +0000 @@ -88,6 +88,9 @@ nodeid_t node; /* NUMA node */ + /* Device to be quarantined, don't automatically re-assign to dom0 */ + bool quarantine; + enum pdev_type { DEV_TYPE_PCI_UNKNOWN, DEV_TYPE_PCIe_ENDPOINT, diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xen/percpu.h xen-4.11.3+24-g14b62ab3e5/xen/include/xen/percpu.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xen/percpu.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xen/percpu.h 2019-12-11 14:35:39.000000000 +0000 @@ -9,9 +9,17 @@ * The _##name concatenation is being used here to prevent 'name' from getting * macro expanded, while still allowing a per-architecture symbol name prefix. 
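A note on the xen/lib.h addition above: printk_once() keeps a function-local static flag per call site, so a given message is emitted only the first time that code path runs. A minimal usage sketch (the function name and message are made up for illustration):

    /* Illustrative only: the first call logs the warning, later calls are
     * silent because the static flag inside printk_once() is already set. */
    static void handle_legacy_request(void)
    {
        printk_once(XENLOG_WARNING
                    "legacy interface used; consider updating the guest\n");
    }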
*/ -#define DEFINE_PER_CPU(type, name) __DEFINE_PER_CPU(type, _##name, ) +#define DEFINE_PER_CPU(type, name) \ + __DEFINE_PER_CPU(__section(".bss.percpu"), type, _ ## name) + +#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ + typedef char name ## _chk_t \ + [BUILD_BUG_ON_ZERO(__alignof(type) & (PAGE_SIZE - 1))]; \ + __DEFINE_PER_CPU(__section(".bss.percpu.page_aligned"), \ + type, _ ## name) + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ - __DEFINE_PER_CPU(type, _##name, .read_mostly) + __DEFINE_PER_CPU(__section(".bss.percpu.read_mostly"), type, _ ## name) /* Preferred on Xen. Also see arch-defined per_cpu(). */ #define this_cpu(var) __get_cpu_var(var) diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xsm/dummy.h xen-4.11.3+24-g14b62ab3e5/xen/include/xsm/dummy.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xsm/dummy.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xsm/dummy.h 2019-12-11 14:35:39.000000000 +0000 @@ -48,7 +48,8 @@ * There is no xsm_default_t argument available, so the value from the assertion * is used to initialize the variable. */ -#define XSM_INLINE /* */ +#define XSM_INLINE __maybe_unused + #define XSM_DEFAULT_ARG /* */ #define XSM_DEFAULT_VOID void #define XSM_ASSERT_ACTION(def) xsm_default_t action = def; (void)action diff -Nru xen-4.11.1+92-g6c33308a8d/xen/include/xsm/xsm.h xen-4.11.3+24-g14b62ab3e5/xen/include/xsm/xsm.h --- xen-4.11.1+92-g6c33308a8d/xen/include/xsm/xsm.h 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/include/xsm/xsm.h 2019-12-11 14:35:39.000000000 +0000 @@ -710,6 +710,11 @@ #endif #ifdef CONFIG_HAS_DEVICE_TREE +/* + * Initialize XSM + * + * On success, return 1 if using SILO mode else 0. + */ extern int xsm_dt_init(void); extern int xsm_dt_policy_init(void **policy_buffer, size_t *policy_size); extern bool has_xsm_magic(paddr_t); @@ -733,6 +738,12 @@ extern const unsigned int xsm_init_policy_size; #endif +#ifdef CONFIG_SILO +extern void silo_init(void); +#else +static inline void silo_init(void) {} +#endif + #else /* CONFIG_XSM */ #include diff -Nru xen-4.11.1+92-g6c33308a8d/xen/xsm/Makefile xen-4.11.3+24-g14b62ab3e5/xen/xsm/Makefile --- xen-4.11.1+92-g6c33308a8d/xen/xsm/Makefile 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/xsm/Makefile 2019-12-11 14:35:39.000000000 +0000 @@ -1,5 +1,6 @@ obj-y += xsm_core.o obj-$(CONFIG_XSM) += xsm_policy.o obj-$(CONFIG_XSM) += dummy.o +obj-$(CONFIG_SILO) += silo.o subdir-$(CONFIG_FLASK) += flask diff -Nru xen-4.11.1+92-g6c33308a8d/xen/xsm/dummy.c xen-4.11.3+24-g14b62ab3e5/xen/xsm/dummy.c --- xen-4.11.1+92-g6c33308a8d/xen/xsm/dummy.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/xsm/dummy.c 2019-12-11 14:35:39.000000000 +0000 @@ -11,7 +11,6 @@ */ #define XSM_NO_WRAPPERS -#define XSM_INLINE /* */ #include struct xsm_operations dummy_xsm_ops; diff -Nru xen-4.11.1+92-g6c33308a8d/xen/xsm/silo.c xen-4.11.3+24-g14b62ab3e5/xen/xsm/silo.c --- xen-4.11.1+92-g6c33308a8d/xen/xsm/silo.c 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/xsm/silo.c 2019-12-11 14:35:39.000000000 +0000 @@ -0,0 +1,108 @@ +/****************************************************************************** + * xsm/silo.c + * + * SILO module for XSM (Xen Security Modules) + * + * Copyright (c) 2018 Citrix Systems Ltd. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
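Returning to the per-cpu rework just above: DEFINE_PER_CPU_PAGE_ALIGNED() places the object in .bss.percpu.page_aligned and uses BUILD_BUG_ON_ZERO() to reject, at build time, any type whose alignment is not a multiple of PAGE_SIZE. A sketch of how it pairs with the DECLARE_PER_CPU(struct tss_page, tss_page) declaration shown in the earlier processor.h hunk; the defining translation unit is not part of this excerpt, so treat this as illustrative:

    /* Illustrative only: the __aligned(PAGE_SIZE) attribute on struct tss_page
     * satisfies the alignment check inside DEFINE_PER_CPU_PAGE_ALIGNED(). */
    DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_page, tss_page);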
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; If not, see <http://www.gnu.org/licenses/>. + */ +#define XSM_NO_WRAPPERS +#include + +/* + * Check if inter-domain communication is allowed. + * Return true when pass check. + */ +static bool silo_mode_dom_check(const struct domain *ldom, + const struct domain *rdom) +{ + const struct domain *currd = current->domain; + + return (is_control_domain(currd) || is_control_domain(ldom) || + is_control_domain(rdom) || ldom == rdom); +} + +static int silo_evtchn_unbound(struct domain *d1, struct evtchn *chn, + domid_t id2) +{ + int rc = -EPERM; + struct domain *d2 = rcu_lock_domain_by_any_id(id2); + + if ( d2 == NULL ) + rc = -ESRCH; + else + { + if ( silo_mode_dom_check(d1, d2) ) + rc = xsm_evtchn_unbound(d1, chn, id2); + rcu_unlock_domain(d2); + } + + return rc; +} + +static int silo_evtchn_interdomain(struct domain *d1, struct evtchn *chan1, + struct domain *d2, struct evtchn *chan2) +{ + if ( silo_mode_dom_check(d1, d2) ) + return xsm_evtchn_interdomain(d1, chan1, d2, chan2); + return -EPERM; +} + +static int silo_grant_mapref(struct domain *d1, struct domain *d2, + uint32_t flags) +{ + if ( silo_mode_dom_check(d1, d2) ) + return xsm_grant_mapref(d1, d2, flags); + return -EPERM; +} + +static int silo_grant_transfer(struct domain *d1, struct domain *d2) +{ + if ( silo_mode_dom_check(d1, d2) ) + return xsm_grant_transfer(d1, d2); + return -EPERM; +} + +static int silo_grant_copy(struct domain *d1, struct domain *d2) +{ + if ( silo_mode_dom_check(d1, d2) ) + return xsm_grant_copy(d1, d2); + return -EPERM; +} + +static struct xsm_operations silo_xsm_ops = { + .evtchn_unbound = silo_evtchn_unbound, + .evtchn_interdomain = silo_evtchn_interdomain, + .grant_mapref = silo_grant_mapref, + .grant_transfer = silo_grant_transfer, + .grant_copy = silo_grant_copy, +}; + +void __init silo_init(void) +{ + printk("Initialising XSM SILO mode\n"); + + if ( register_xsm(&silo_xsm_ops) ) + panic("SILO: Unable to register with XSM\n"); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -Nru xen-4.11.1+92-g6c33308a8d/xen/xsm/xsm_core.c xen-4.11.3+24-g14b62ab3e5/xen/xsm/xsm_core.c --- xen-4.11.1+92-g6c33308a8d/xen/xsm/xsm_core.c 2019-06-06 22:18:26.000000000 +0000 +++ xen-4.11.3+24-g14b62ab3e5/xen/xsm/xsm_core.c 2019-12-11 14:35:39.000000000 +0000 @@ -31,6 +31,42 @@ struct xsm_operations *xsm_ops; +enum xsm_bootparam { + XSM_BOOTPARAM_DUMMY, + XSM_BOOTPARAM_FLASK, + XSM_BOOTPARAM_SILO, +}; + +static enum xsm_bootparam __initdata xsm_bootparam = +#if defined(CONFIG_XSM_FLASK_DEFAULT) + XSM_BOOTPARAM_FLASK; +#elif defined(CONFIG_XSM_SILO_DEFAULT) + XSM_BOOTPARAM_SILO; +#else + XSM_BOOTPARAM_DUMMY; +#endif + +static int __init parse_xsm_param(const char *s) +{ + int rc = 0; + + if ( !strcmp(s, "dummy") ) + xsm_bootparam = XSM_BOOTPARAM_DUMMY; +#ifdef CONFIG_FLASK + else if ( !strcmp(s, "flask") ) + xsm_bootparam = XSM_BOOTPARAM_FLASK; +#endif +#ifdef CONFIG_SILO + else if ( !strcmp(s, "silo") ) + xsm_bootparam = XSM_BOOTPARAM_SILO; +#endif + else + rc = -EINVAL; + + return rc; +} +custom_param("xsm", parse_xsm_param); + static inline int verify(struct xsm_operations *ops) { /* verify the security_operations
structure exists */ @@ -57,7 +93,24 @@ } xsm_ops = &dummy_xsm_ops; - flask_init(policy_buffer, policy_size); + + switch ( xsm_bootparam ) + { + case XSM_BOOTPARAM_DUMMY: + break; + + case XSM_BOOTPARAM_FLASK: + flask_init(policy_buffer, policy_size); + break; + + case XSM_BOOTPARAM_SILO: + silo_init(); + break; + + default: + ASSERT_UNREACHABLE(); + break; + } return 0; } @@ -114,7 +167,7 @@ xfree(policy_buffer); - return ret; + return ret ?: (xsm_bootparam == XSM_BOOTPARAM_SILO); } /**
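Taken together, the xsm_core.c hunks map the new `xsm=' command-line option (dummy, flask or silo) onto the matching init function, and xsm_init() now reports SILO mode through its return value. The SILO rule itself is small: inter-domain event channels and grant operations are permitted only when at least one endpoint is the control domain or both endpoints are the same domain. A stand-alone model of that rule in plain C, with made-up types purely for illustration (this is not hypervisor code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of silo_mode_dom_check(): one endpoint must be the control
     * domain, or the two endpoints must be the same domain. */
    struct dom { unsigned int id; bool is_control; };

    static bool silo_allows(const struct dom *currd,
                            const struct dom *ldom, const struct dom *rdom)
    {
        return currd->is_control || ldom->is_control ||
               rdom->is_control || ldom == rdom;
    }

    int main(void)
    {
        struct dom dom0 = { 0, true }, domA = { 1, false }, domB = { 2, false };

        /* domA <-> dom0 is allowed; domA <-> domB is refused under SILO. */
        printf("A<->0: %d, A<->B: %d\n",
               silo_allows(&domA, &domA, &dom0),
               silo_allows(&domA, &domA, &domB));
        return 0;
    }

With CONFIG_SILO enabled at build time, selecting the module at runtime is then just a matter of appending xsm=silo to the hypervisor command line.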