Version in base suite: 4.11.4+24-gddaaccbbab-1~deb10u1
Version in overlay suite: 4.11.4+37-g3263f257ca-1
Base version: xen_4.11.4+37-g3263f257ca-1
Target version: xen_4.11.4+57-g41a822c392-1
Base file: /srv/ftp-master.debian.org/ftp/pool/main/x/xen/xen_4.11.4+37-g3263f257ca-1.dsc
Target file: /srv/ftp-master.debian.org/policy/pool/main/x/xen/xen_4.11.4+57-g41a822c392-1.dsc

 SUPPORT.md                                         |   15 +
 debian/changelog                                   |   19 +
 debian/patches/0004-Various-Fix-typo-occured.patch |    4
 xen/arch/arm/mm.c                                  |    6
 xen/arch/arm/traps.c                               |    3
 xen/arch/x86/irq.c                                 |    6
 xen/arch/x86/mm.c                                  |  205 ++++++++++++++++++---
 xen/arch/x86/msr.c                                 |   54 +++++
 xen/arch/x86/pv/emul-priv-op.c                     |   14 -
 xen/arch/x86/pv/shim.c                             |    9
 xen/common/event_channel.c                         |  141 ++++++++------
 xen/common/event_fifo.c                            |  141 ++++++++------
 xen/common/memory.c                                |   61 +++++-
 xen/drivers/passthrough/amd/iommu_map.c            |   35 ++-
 xen/include/asm-arm/processor.h                    |    1
 xen/include/asm-x86/msr-index.h                    |   38 +++
 xen/include/xen/event.h                            |   27 ++
 xen/include/xen/mm.h                               |   16 +
 xen/include/xen/sched.h                            |   24 ++
 19 files changed, 625 insertions(+), 194 deletions(-)

diff -Nru xen-4.11.4+37-g3263f257ca/SUPPORT.md xen-4.11.4+57-g41a822c392/SUPPORT.md
--- xen-4.11.4+37-g3263f257ca/SUPPORT.md	2020-09-22 15:23:04.000000000 +0000
+++ xen-4.11.4+57-g41a822c392/SUPPORT.md	2020-12-01 16:07:03.000000000 +0000
@@ -692,6 +692,21 @@
 
     Status: Supported, not security supported
 
+### qemu-xen-traditional ###
+
+The Xen Project provides an old version of qemu with modifications
+which enable use as a device model stub domain.  The old version is
+normally selected by default only in a stub dm configuration, but it
+can be requested explicitly in other configurations, for example in
+`xl` with `device_model_version="QEMU_XEN_TRADITIONAL"`.
+
+    Status, Device Model Stub Domains: Supported, with caveats
+    Status, as host process device model: No security support, not recommended
+
+qemu-xen-traditional is security supported only for those available
+devices which are supported for mainstream QEMU (see above), with
+trusted driver domains (see Device Model Stub Domains).
+ ## Virtual Firmware ### x86/HVM iPXE diff -Nru xen-4.11.4+37-g3263f257ca/debian/changelog xen-4.11.4+57-g41a822c392/debian/changelog --- xen-4.11.4+37-g3263f257ca/debian/changelog 2020-10-01 12:50:58.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/debian/changelog 2020-12-03 12:56:29.000000000 +0000 @@ -1,3 +1,22 @@ +xen (4.11.4+57-g41a822c392-1) buster-security; urgency=high + + * Update to new upstream version 4.11.4+57-g41a822c392, which also contains + security fixes for the following issues: + - x86: Race condition in Xen mapping code + XSA-345 (CVE-2020-27672) + - undue deferral of IOMMU TLB flushes + XSA-346 (CVE-2020-27671) + - unsafe AMD IOMMU page table updates + XSA-347 (CVE-2020-27670) + - x86 PV guest INVLPG-like flushes may leave stale TLB entries + XSA-286 (CVE-2020-27674) + - Information leak via power sidechannel + XSA-351 (CVE-2020-28368) + - stack corruption from XSA-346 change + XSA-355 (CVE-2020-29040) + + -- Hans van Kranenburg Thu, 03 Dec 2020 13:56:29 +0100 + xen (4.11.4+37-g3263f257ca-1) buster-security; urgency=high * Update to new upstream version 4.11.4+37-g3263f257ca, which also contains diff -Nru xen-4.11.4+37-g3263f257ca/debian/patches/0004-Various-Fix-typo-occured.patch xen-4.11.4+57-g41a822c392/debian/patches/0004-Various-Fix-typo-occured.patch --- xen-4.11.4+37-g3263f257ca/debian/patches/0004-Various-Fix-typo-occured.patch 2020-10-01 12:50:58.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/debian/patches/0004-Various-Fix-typo-occured.patch 2020-12-03 12:56:29.000000000 +0000 @@ -106,10 +106,10 @@ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ #define VIRQ_XENPMU 13 /* V. PMC interrupt */ diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index c0cc5d9..1ce89fc 100644 +index 81af120..98c58c8 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h -@@ -638,7 +638,7 @@ void noreturn __domain_crash_synchronous(void); +@@ -640,7 +640,7 @@ void noreturn __domain_crash_synchronous(void); /* * Called from assembly code, with an optional address to help indicate why diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/arm/mm.c xen-4.11.4+57-g41a822c392/xen/arch/arm/mm.c --- xen-4.11.4+37-g3263f257ca/xen/arch/arm/mm.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/arm/mm.c 2020-12-01 16:07:03.000000000 +0000 @@ -1222,7 +1222,7 @@ int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn) { @@ -1294,10 +1294,6 @@ break; } case XENMAPSPACE_dev_mmio: - /* extra should be 0. Reserved for future use. */ - if ( extra.res0 ) - return -EOPNOTSUPP; - rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx)); return rc; diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/arm/traps.c xen-4.11.4+57-g41a822c392/xen/arch/arm/traps.c --- xen-4.11.4+37-g3263f257ca/xen/arch/arm/traps.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/arm/traps.c 2020-12-01 16:07:03.000000000 +0000 @@ -179,7 +179,8 @@ * On ARM64 the TCPx bits which we set here (0..9,12,13) are all * RES1, i.e. they would trap whether we did this write or not. 
*/ - WRITE_SYSREG((HCPTR_CP_MASK & ~(HCPTR_CP(10) | HCPTR_CP(11))) | HCPTR_TTA, + WRITE_SYSREG((HCPTR_CP_MASK & ~(HCPTR_CP(10) | HCPTR_CP(11))) | + HCPTR_TTA | HCPTR_TAM, CPTR_EL2); /* Setup hypervisor traps */ diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/x86/irq.c xen-4.11.4+57-g41a822c392/xen/arch/x86/irq.c --- xen-4.11.4+37-g3263f257ca/xen/arch/x86/irq.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/x86/irq.c 2020-12-01 16:07:03.000000000 +0000 @@ -2331,14 +2331,12 @@ pirq = domain_irq_to_pirq(d, irq); info = pirq_info(d, pirq); evtchn = evtchn_from_port(d, info->evtchn); - local_irq_disable(); - if ( spin_trylock(&evtchn->lock) ) + if ( evtchn_read_trylock(evtchn) ) { pending = evtchn_is_pending(d, evtchn); masked = evtchn_is_masked(d, evtchn); - spin_unlock(&evtchn->lock); + evtchn_read_unlock(evtchn); } - local_irq_enable(); printk("%u:%3d(%c%c%c)", d->domain_id, pirq, "-P?"[pending], "-M?"[masked], info->masked ? 'M' : '-'); diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/x86/mm.c xen-4.11.4+57-g41a822c392/xen/arch/x86/mm.c --- xen-4.11.4+37-g3263f257ca/xen/arch/x86/mm.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/x86/mm.c 2020-12-01 16:07:03.000000000 +0000 @@ -2154,6 +2154,50 @@ } /* + * L3 table locks: + * + * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). + * + * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to + * reuse the PGT_locked flag. This lock is taken only when we move down to L3 + * tables and below, since L4 (and above, for 5-level paging) is still globally + * protected by map_pgdir_lock. + * + * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). + * This has two implications: + * - We cannot reuse reuse current_locked_page_* for debugging + * - To avoid the chance of deadlock, even for different pages, we + * must never grab page_lock() after grabbing l3t_lock(). This + * includes any page_lock()-based locks, such as + * mem_sharing_page_lock(). + * + * Also note that we grab the map_pgdir_lock while holding the + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +static void l3t_lock(struct page_info *page) +{ + unsigned long x, nx; + + do { + while ( (x = page->u.inuse.type_info) & PGT_locked ) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); +} + +static void l3t_unlock(struct page_info *page) +{ + unsigned long x, nx, y = page->u.inuse.type_info; + + do { + x = y; + BUG_ON(!(x & PGT_locked)); + nx = x & ~PGT_locked; + } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); +} + +/* * PTE flags that a guest may change without re-validating the PTE. * All other bits affect translation, caching, or Xen's safety. 
*/ @@ -3939,7 +3983,8 @@ struct vcpu *curr = current, *v = curr; struct domain *d = v->domain, *pt_owner = d, *pg_owner; mfn_t map_mfn = INVALID_MFN; - bool sync_guest = false; + bool flush_linear_pt = false, flush_root_pt_local = false, + flush_root_pt_others = false; uint32_t xsm_needed = 0; uint32_t xsm_checked = 0; int rc = put_old_guest_table(curr); @@ -4089,6 +4134,8 @@ break; rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); + if ( !rc ) + flush_linear_pt = true; break; case PGT_l3_page_table: @@ -4096,6 +4143,8 @@ break; rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); + if ( !rc ) + flush_linear_pt = true; break; case PGT_l4_page_table: @@ -4103,6 +4152,8 @@ break; rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); + if ( !rc ) + flush_linear_pt = true; if ( !rc && pt_owner->arch.pv_domain.xpti ) { bool local_in_use = false; @@ -4110,7 +4161,7 @@ if ( pagetable_get_pfn(curr->arch.guest_table) == mfn ) { local_in_use = true; - get_cpu_info()->root_pgt_changed = true; + flush_root_pt_local = true; } /* @@ -4122,7 +4173,7 @@ (1 + !!(page->u.inuse.type_info & PGT_pinned) + (pagetable_get_pfn(curr->arch.guest_table_user) == mfn) + local_in_use) ) - sync_guest = true; + flush_root_pt_others = true; } break; @@ -4224,19 +4275,61 @@ if ( va ) unmap_domain_page(va); - if ( sync_guest ) + /* + * Perform required TLB maintenance. + * + * This logic currently depend on flush_linear_pt being a superset of the + * flush_root_pt_* conditions. + * + * pt_owner may not be current->domain. This may occur during + * construction of 32bit PV guests, or debugging of PV guests. The + * behaviour cannot be correct with domain unpaused. We therefore expect + * pt_owner->dirty_cpumask to be empty, but it is a waste of effort to + * explicitly check for, and exclude, this corner case. + * + * flush_linear_pt requires a FLUSH_TLB to all dirty CPUs. The flush must + * be performed now to maintain correct behaviour across a multicall. + * i.e. we cannot relax FLUSH_TLB to FLUSH_ROOT_PGTBL, given that the + * former is a side effect of the latter, because the resync (which is in + * the return-to-guest path) happens too late. + * + * flush_root_pt_* requires FLUSH_ROOT_PGTBL on either the local CPU + * (implies pt_owner == current->domain and current->processor set in + * pt_owner->dirty_cpumask), and/or all *other* dirty CPUs as there are + * references we can't account for locally. + */ + if ( flush_linear_pt /* || flush_root_pt_local || flush_root_pt_others */ ) { + unsigned int cpu = smp_processor_id(); + cpumask_t *mask = pt_owner->dirty_cpumask; + /* - * Force other vCPU-s of the affected guest to pick up L4 entry - * changes (if any). + * Always handle local flushing separately (if applicable), to + * separate the flush invocations appropriately for scope of the two + * flush_root_pt_* variables. */ - unsigned int cpu = smp_processor_id(); - cpumask_t *mask = per_cpu(scratch_cpumask, cpu); + if ( likely(cpumask_test_cpu(cpu, mask)) ) + { + mask = per_cpu(scratch_cpumask, cpu); + + cpumask_copy(mask, pt_owner->dirty_cpumask); + __cpumask_clear_cpu(cpu, mask); + + flush_local(FLUSH_TLB | + (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0)); + } + else + /* Sanity check. flush_root_pt_local implies local cpu is dirty. */ + ASSERT(!flush_root_pt_local); - cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu)); + /* Flush the remote dirty CPUs. Does not include the local CPU. 
*/ if ( !cpumask_empty(mask) ) - flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL); + flush_mask(mask, FLUSH_TLB | + (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0)); } + else + /* Sanity check. flush_root_pt_* implies flush_linear_pt. */ + ASSERT(!flush_root_pt_local && !flush_root_pt_others); perfc_add(num_page_updates, i); @@ -4634,7 +4727,7 @@ int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gpfn) { @@ -4721,9 +4814,20 @@ rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); put_both: - /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */ + /* + * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. + * We also may need to transfer ownership of the page reference to our + * caller. + */ if ( space == XENMAPSPACE_gmfn ) + { put_gfn(d, gfn); + if ( !rc && extra.ppage ) + { + *extra.ppage = page; + page = NULL; + } + } if ( page ) put_page(page); @@ -5184,6 +5288,23 @@ flush_area_local((const void *)v, f) : \ flush_area_all((const void *)v, f)) +#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR + +#define L3T_LOCK(page) \ + do { \ + if ( locking ) \ + l3t_lock(page); \ + } while ( false ) + +#define L3T_UNLOCK(page) \ + do { \ + if ( locking && (page) != ZERO_BLOCK_PTR ) \ + { \ + l3t_unlock(page); \ + (page) = ZERO_BLOCK_PTR; \ + } \ + } while ( false ) + int map_pages_to_xen( unsigned long virt, mfn_t mfn, @@ -5194,6 +5315,8 @@ l2_pgentry_t *pl2e, ol2e; l1_pgentry_t *pl1e, ol1e; unsigned int i; + int rc = -ENOMEM; + struct page_info *current_l3page; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ @@ -5209,12 +5332,20 @@ } \ } while (0) + L3T_INIT(current_l3page); + while ( nr_mfns != 0 ) { - l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); + l3_pgentry_t *pl3e, ol3e; + L3T_UNLOCK(current_l3page); + + pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) - return -ENOMEM; + goto out; + + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); ol3e = *pl3e; if ( cpu_has_page1gb && @@ -5302,7 +5433,7 @@ pl2e = alloc_xen_pagetable(); if ( pl2e == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(pl2e + i, @@ -5331,7 +5462,7 @@ pl2e = virt_to_xen_l2e(virt); if ( !pl2e ) - return -ENOMEM; + goto out; if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1u << PAGETABLE_ORDER) - 1)) == 0) && @@ -5374,7 +5505,7 @@ { pl1e = virt_to_xen_l1e(virt); if ( pl1e == NULL ) - return -ENOMEM; + goto out; } else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { @@ -5401,7 +5532,7 @@ pl1e = alloc_xen_pagetable(); if ( pl1e == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&pl1e[i], @@ -5545,7 +5676,11 @@ #undef flush_flags - return 0; + rc = 0; + + out: + L3T_UNLOCK(current_l3page); + return rc; } int populate_pt_range(unsigned long virt, unsigned long nr_mfns) @@ -5572,6 +5707,8 @@ l1_pgentry_t *pl1e; unsigned int i; unsigned long v = s; + int rc = -ENOMEM; + struct page_info *current_l3page; /* Set of valid PTE bits which may be altered. 
*/ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT) @@ -5580,11 +5717,22 @@ ASSERT(IS_ALIGNED(s, PAGE_SIZE)); ASSERT(IS_ALIGNED(e, PAGE_SIZE)); + L3T_INIT(current_l3page); + while ( v < e ) { - l3_pgentry_t *pl3e = virt_to_xen_l3e(v); + l3_pgentry_t *pl3e; + + L3T_UNLOCK(current_l3page); - if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + pl3e = virt_to_xen_l3e(v); + if ( !pl3e ) + goto out; + + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); + + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { /* Confirm the caller isn't trying to create new mappings. */ ASSERT(!(nf & _PAGE_PRESENT)); @@ -5613,7 +5761,8 @@ /* PAGE1GB: shatter the superpage and fall through. */ pl2e = alloc_xen_pagetable(); if ( !pl2e ) - return -ENOMEM; + goto out; + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(pl2e + i, l2e_from_pfn(l3e_get_pfn(*pl3e) + @@ -5668,7 +5817,8 @@ /* PSE: shatter the superpage and try again. */ pl1e = alloc_xen_pagetable(); if ( !pl1e ) - return -ENOMEM; + goto out; + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&pl1e[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, @@ -5797,9 +5947,16 @@ flush_area(NULL, FLUSH_TLB_GLOBAL); #undef FLAGS_MASK - return 0; + rc = 0; + + out: + L3T_UNLOCK(current_l3page); + return rc; } +#undef L3T_LOCK +#undef L3T_UNLOCK + #undef flush_area int destroy_xen_mappings(unsigned long s, unsigned long e) diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/x86/msr.c xen-4.11.4+57-g41a822c392/xen/arch/x86/msr.c --- xen-4.11.4+37-g3263f257ca/xen/arch/x86/msr.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/x86/msr.c 2020-12-01 16:07:03.000000000 +0000 @@ -141,6 +141,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { + const struct domain *d = v->domain; const struct cpuid_policy *cp = v->domain->arch.cpuid; const struct msr_domain_policy *dp = v->domain->arch.msr; const struct msr_vcpu_policy *vp = v->arch.msr; @@ -155,6 +156,15 @@ case MSR_TSX_FORCE_ABORT: case MSR_TSX_CTRL: case MSR_MCU_OPT_CTRL: + case MSR_RAPL_POWER_UNIT: + case MSR_PKG_POWER_LIMIT ... MSR_PKG_POWER_INFO: + case MSR_DRAM_POWER_LIMIT ... MSR_DRAM_POWER_INFO: + case MSR_PP0_POWER_LIMIT ... MSR_PP0_POLICY: + case MSR_PP1_POWER_LIMIT ... MSR_PP1_POLICY: + case MSR_PLATFORM_ENERGY_COUNTER: + case MSR_PLATFORM_POWER_LIMIT: + case MSR_F15H_CU_POWER ... MSR_F15H_CU_MAX_POWER: + case MSR_AMD_RAPL_POWER_UNIT ... MSR_AMD_PKG_ENERGY_STATUS: /* Not offered to guests. */ goto gp_fault; @@ -212,6 +222,25 @@ break; /* + * These MSRs are not enumerated in CPUID. They have been around + * since the Pentium 4, and implemented by other vendors. + * + * Some versions of Windows try reading these before setting up a #GP + * handler, and Linux has several unguarded reads as well. Provide + * RAZ semantics, in general, but permit a cpufreq controller dom0 to + * have full access. + */ + case MSR_IA32_PERF_STATUS: + case MSR_IA32_PERF_CTL: + if ( !(cp->x86_vendor & (X86_VENDOR_INTEL | X86_VENDOR_CENTAUR)) ) + goto gp_fault; + + *val = 0; + if ( likely(!is_cpufreq_controller(d)) || rdmsr_safe(msr, *val) == 0 ) + break; + goto gp_fault; + + /* * TODO: Implement when we have better topology representation. case MSR_INTEL_CORE_THREAD_COUNT: */ @@ -241,10 +270,20 @@ case MSR_INTEL_CORE_THREAD_COUNT: case MSR_INTEL_PLATFORM_INFO: case MSR_ARCH_CAPABILITIES: + case MSR_IA32_PERF_STATUS: /* Read-only */ case MSR_TSX_FORCE_ABORT: case MSR_TSX_CTRL: case MSR_MCU_OPT_CTRL: + case MSR_RAPL_POWER_UNIT: + case MSR_PKG_POWER_LIMIT ... 
MSR_PKG_POWER_INFO: + case MSR_DRAM_POWER_LIMIT ... MSR_DRAM_POWER_INFO: + case MSR_PP0_POWER_LIMIT ... MSR_PP0_POLICY: + case MSR_PP1_POWER_LIMIT ... MSR_PP1_POLICY: + case MSR_PLATFORM_ENERGY_COUNTER: + case MSR_PLATFORM_POWER_LIMIT: + case MSR_F15H_CU_POWER ... MSR_F15H_CU_MAX_POWER: + case MSR_AMD_RAPL_POWER_UNIT ... MSR_AMD_PKG_ENERGY_STATUS: /* Not offered to guests. */ goto gp_fault; @@ -345,6 +384,21 @@ break; } + /* + * This MSR is not enumerated in CPUID. It has been around since the + * Pentium 4, and implemented by other vendors. + * + * To match the RAZ semantics, implement as write-discard, except for + * a cpufreq controller dom0 which has full access. + */ + case MSR_IA32_PERF_CTL: + if ( !(cp->x86_vendor & (X86_VENDOR_INTEL | X86_VENDOR_CENTAUR)) ) + goto gp_fault; + + if ( likely(!is_cpufreq_controller(d)) || wrmsr_safe(msr, val) == 0 ) + break; + goto gp_fault; + default: return X86EMUL_UNHANDLEABLE; } diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/x86/pv/emul-priv-op.c xen-4.11.4+57-g41a822c392/xen/arch/x86/pv/emul-priv-op.c --- xen-4.11.4+37-g3263f257ca/xen/arch/x86/pv/emul-priv-op.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/x86/pv/emul-priv-op.c 2020-12-01 16:07:03.000000000 +0000 @@ -816,12 +816,6 @@ return val; } -static inline bool is_cpufreq_controller(const struct domain *d) -{ - return ((cpufreq_controller == FREQCTL_dom0_kernel) && - is_hardware_domain(d)); -} - static int read_msr(unsigned int reg, uint64_t *val, struct x86_emulate_ctxt *ctxt) { @@ -1093,14 +1087,6 @@ break; if ( likely(!is_cpufreq_controller(currd)) || wrmsr_safe(reg, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_IA32_PERF_CTL: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) - break; - if ( likely(!is_cpufreq_controller(currd)) || - wrmsr_safe(reg, val) == 0 ) return X86EMUL_OKAY; break; diff -Nru xen-4.11.4+37-g3263f257ca/xen/arch/x86/pv/shim.c xen-4.11.4+57-g41a822c392/xen/arch/x86/pv/shim.c --- xen-4.11.4+37-g3263f257ca/xen/arch/x86/pv/shim.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/arch/x86/pv/shim.c 2020-12-01 16:07:03.000000000 +0000 @@ -616,11 +616,12 @@ if ( port_is_valid(guest, port) ) { struct evtchn *chn = evtchn_from_port(guest, port); - unsigned long flags; - spin_lock_irqsave(&chn->lock, flags); - evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn); - spin_unlock_irqrestore(&chn->lock, flags); + if ( evtchn_read_trylock(chn) ) + { + evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn); + evtchn_read_unlock(chn); + } } } diff -Nru xen-4.11.4+37-g3263f257ca/xen/common/event_channel.c xen-4.11.4+57-g41a822c392/xen/common/event_channel.c --- xen-4.11.4+37-g3263f257ca/xen/common/event_channel.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/common/event_channel.c 2020-12-01 16:07:03.000000000 +0000 @@ -51,6 +51,40 @@ #define consumer_is_xen(e) (!!(e)->xen_consumer) /* + * Lock an event channel exclusively. This is allowed only when the channel is + * free or unbound either when taking or when releasing the lock, as any + * concurrent operation on the event channel using evtchn_read_trylock() will + * just assume the event channel is free or unbound at the moment when the + * evtchn_read_trylock() returns false. 
+ */ +static inline void evtchn_write_lock(struct evtchn *evtchn) +{ + write_lock(&evtchn->lock); + +#ifndef NDEBUG + evtchn->old_state = evtchn->state; +#endif +} + +static inline unsigned int old_state(const struct evtchn *evtchn) +{ +#ifndef NDEBUG + return evtchn->old_state; +#else + return ECS_RESERVED; /* Just to allow things to build. */ +#endif +} + +static inline void evtchn_write_unlock(struct evtchn *evtchn) +{ + /* Enforce lock discipline. */ + ASSERT(old_state(evtchn) == ECS_FREE || old_state(evtchn) == ECS_UNBOUND || + evtchn->state == ECS_FREE || evtchn->state == ECS_UNBOUND); + + write_unlock(&evtchn->lock); +} + +/* * The function alloc_unbound_xen_event_channel() allows an arbitrary * notifier function to be specified. However, very few unique functions * are specified in practice, so to prevent bloating the evtchn structure @@ -131,7 +165,7 @@ return NULL; } chn[i].port = port + i; - spin_lock_init(&chn[i].lock); + rwlock_init(&chn[i].lock); } return chn; } @@ -249,7 +283,6 @@ int port; domid_t dom = alloc->dom; long rc; - unsigned long flags; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) @@ -265,14 +298,14 @@ if ( rc ) goto out; - spin_lock_irqsave(&chn->lock, flags); + evtchn_write_lock(chn); chn->state = ECS_UNBOUND; if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) chn->u.unbound.remote_domid = current->domain->domain_id; evtchn_port_init(d, chn); - spin_unlock_irqrestore(&chn->lock, flags); + evtchn_write_unlock(chn); alloc->port = port; @@ -285,32 +318,26 @@ } -static unsigned long double_evtchn_lock(struct evtchn *lchn, - struct evtchn *rchn) +static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) { - unsigned long flags; - if ( lchn <= rchn ) { - spin_lock_irqsave(&lchn->lock, flags); + evtchn_write_lock(lchn); if ( lchn != rchn ) - spin_lock(&rchn->lock); + evtchn_write_lock(rchn); } else { - spin_lock_irqsave(&rchn->lock, flags); - spin_lock(&lchn->lock); + evtchn_write_lock(rchn); + evtchn_write_lock(lchn); } - - return flags; } -static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn, - unsigned long flags) +static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) { if ( lchn != rchn ) - spin_unlock(&lchn->lock); - spin_unlock_irqrestore(&rchn->lock, flags); + evtchn_write_unlock(lchn); + evtchn_write_unlock(rchn); } static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) @@ -320,7 +347,6 @@ int lport, rport = bind->remote_port; domid_t rdom = bind->remote_dom; long rc; - unsigned long flags; if ( rdom == DOMID_SELF ) rdom = current->domain->domain_id; @@ -356,7 +382,7 @@ if ( rc ) goto out; - flags = double_evtchn_lock(lchn, rchn); + double_evtchn_lock(lchn, rchn); lchn->u.interdomain.remote_dom = rd; lchn->u.interdomain.remote_port = rport; @@ -373,7 +399,7 @@ */ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn); - double_evtchn_unlock(lchn, rchn, flags); + double_evtchn_unlock(lchn, rchn); bind->local_port = lport; @@ -396,7 +422,6 @@ struct domain *d = current->domain; int virq = bind->virq, vcpu = bind->vcpu; int rc = 0; - unsigned long flags; if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) return -EINVAL; @@ -429,14 +454,14 @@ chn = evtchn_from_port(d, port); - spin_lock_irqsave(&chn->lock, flags); + evtchn_write_lock(chn); chn->state = ECS_VIRQ; chn->notify_vcpu_id = vcpu; chn->u.virq = virq; evtchn_port_init(d, chn); - spin_unlock_irqrestore(&chn->lock, flags); + evtchn_write_unlock(chn); v->virq_to_evtchn[virq] = bind->port = port; @@ 
-453,7 +478,6 @@ struct domain *d = current->domain; int port, vcpu = bind->vcpu; long rc = 0; - unsigned long flags; if ( (vcpu < 0) || (vcpu >= d->max_vcpus) || (d->vcpu[vcpu] == NULL) ) @@ -466,13 +490,13 @@ chn = evtchn_from_port(d, port); - spin_lock_irqsave(&chn->lock, flags); + evtchn_write_lock(chn); chn->state = ECS_IPI; chn->notify_vcpu_id = vcpu; evtchn_port_init(d, chn); - spin_unlock_irqrestore(&chn->lock, flags); + evtchn_write_unlock(chn); bind->port = port; @@ -516,7 +540,6 @@ struct pirq *info; int port = 0, pirq = bind->pirq; long rc; - unsigned long flags; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; @@ -549,14 +572,14 @@ goto out; } - spin_lock_irqsave(&chn->lock, flags); + evtchn_write_lock(chn); chn->state = ECS_PIRQ; chn->u.pirq.irq = pirq; link_pirq_port(port, chn, v); evtchn_port_init(d, chn); - spin_unlock_irqrestore(&chn->lock, flags); + evtchn_write_unlock(chn); bind->port = port; @@ -577,7 +600,6 @@ struct evtchn *chn1, *chn2; int port2; long rc = 0; - unsigned long flags; again: spin_lock(&d1->event_lock); @@ -677,14 +699,14 @@ BUG_ON(chn2->state != ECS_INTERDOMAIN); BUG_ON(chn2->u.interdomain.remote_dom != d1); - flags = double_evtchn_lock(chn1, chn2); + double_evtchn_lock(chn1, chn2); evtchn_free(d1, chn1); chn2->state = ECS_UNBOUND; chn2->u.unbound.remote_domid = d1->domain_id; - double_evtchn_unlock(chn1, chn2, flags); + double_evtchn_unlock(chn1, chn2); goto out; @@ -692,9 +714,9 @@ BUG(); } - spin_lock_irqsave(&chn1->lock, flags); + evtchn_write_lock(chn1); evtchn_free(d1, chn1); - spin_unlock_irqrestore(&chn1->lock, flags); + evtchn_write_unlock(chn1); out: if ( d2 != NULL ) @@ -714,14 +736,13 @@ struct evtchn *lchn, *rchn; struct domain *rd; int rport, ret = 0; - unsigned long flags; if ( !port_is_valid(ld, lport) ) return -EINVAL; lchn = evtchn_from_port(ld, lport); - spin_lock_irqsave(&lchn->lock, flags); + evtchn_read_lock(lchn); /* Guest cannot send via a Xen-attached event channel. */ if ( unlikely(consumer_is_xen(lchn)) ) @@ -756,7 +777,7 @@ } out: - spin_unlock_irqrestore(&lchn->lock, flags); + evtchn_read_unlock(lchn); return ret; } @@ -783,9 +804,11 @@ d = v->domain; chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); - evtchn_port_set_pending(d, v->vcpu_id, chn); - spin_unlock(&chn->lock); + if ( evtchn_read_trylock(chn) ) + { + evtchn_port_set_pending(d, v->vcpu_id, chn); + evtchn_read_unlock(chn); + } out: spin_unlock_irqrestore(&v->virq_lock, flags); @@ -814,9 +837,11 @@ goto out; chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); - evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); - spin_unlock(&chn->lock); + if ( evtchn_read_trylock(chn) ) + { + evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); + evtchn_read_unlock(chn); + } out: spin_unlock_irqrestore(&v->virq_lock, flags); @@ -826,7 +851,6 @@ { int port; struct evtchn *chn; - unsigned long flags; /* * PV guests: It should not be possible to race with __evtchn_close(). 
The @@ -841,9 +865,11 @@ } chn = evtchn_from_port(d, port); - spin_lock_irqsave(&chn->lock, flags); - evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); - spin_unlock_irqrestore(&chn->lock, flags); + if ( evtchn_read_trylock(chn) ) + { + evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); + evtchn_read_unlock(chn); + } } static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly; @@ -1038,15 +1064,17 @@ { struct domain *d = current->domain; struct evtchn *evtchn; - unsigned long flags; if ( unlikely(!port_is_valid(d, port)) ) return -EINVAL; evtchn = evtchn_from_port(d, port); - spin_lock_irqsave(&evtchn->lock, flags); + + evtchn_read_lock(evtchn); + evtchn_port_unmask(d, evtchn); - spin_unlock_irqrestore(&evtchn->lock, flags); + + evtchn_read_unlock(evtchn); return 0; } @@ -1292,7 +1320,6 @@ { struct evtchn *chn; int port, rc; - unsigned long flags; spin_lock(&ld->event_lock); @@ -1305,14 +1332,14 @@ if ( rc ) goto out; - spin_lock_irqsave(&chn->lock, flags); + evtchn_write_lock(chn); chn->state = ECS_UNBOUND; chn->xen_consumer = get_xen_consumer(notification_fn); chn->notify_vcpu_id = lvcpu; chn->u.unbound.remote_domid = remote_domid; - spin_unlock_irqrestore(&chn->lock, flags); + evtchn_write_unlock(chn); write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); @@ -1344,7 +1371,6 @@ { struct evtchn *lchn, *rchn; struct domain *rd; - unsigned long flags; if ( !port_is_valid(ld, lport) ) { @@ -1359,7 +1385,8 @@ lchn = evtchn_from_port(ld, lport); - spin_lock_irqsave(&lchn->lock, flags); + if ( !evtchn_read_trylock(lchn) ) + return; if ( likely(lchn->state == ECS_INTERDOMAIN) ) { @@ -1369,7 +1396,7 @@ evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn); } - spin_unlock_irqrestore(&lchn->lock, flags); + evtchn_read_unlock(lchn); } void evtchn_check_pollers(struct domain *d, unsigned int port) diff -Nru xen-4.11.4+37-g3263f257ca/xen/common/event_fifo.c xen-4.11.4+57-g41a822c392/xen/common/event_fifo.c --- xen-4.11.4+37-g3263f257ca/xen/common/event_fifo.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/common/event_fifo.c 2020-12-01 16:07:03.000000000 +0000 @@ -21,6 +21,14 @@ #include +union evtchn_fifo_lastq { + uint32_t raw; + struct { + uint8_t last_priority; + uint16_t last_vcpu_id; + }; +}; + static inline event_word_t *evtchn_fifo_word_from_port(const struct domain *d, unsigned int port) { @@ -57,36 +65,6 @@ d->domain_id, evtchn->port); } -static struct evtchn_fifo_queue *lock_old_queue(const struct domain *d, - struct evtchn *evtchn, - unsigned long *flags) -{ - struct vcpu *v; - struct evtchn_fifo_queue *q, *old_q; - unsigned int try; - - for ( try = 0; try < 3; try++ ) - { - v = d->vcpu[evtchn->last_vcpu_id]; - old_q = &v->evtchn_fifo->queue[evtchn->last_priority]; - - spin_lock_irqsave(&old_q->lock, *flags); - - v = d->vcpu[evtchn->last_vcpu_id]; - q = &v->evtchn_fifo->queue[evtchn->last_priority]; - - if ( old_q == q ) - return old_q; - - spin_unlock_irqrestore(&old_q->lock, *flags); - } - - gprintk(XENLOG_WARNING, - "dom%d port %d lost event (too many queue changes)\n", - d->domain_id, evtchn->port); - return NULL; -} - static int try_set_link(event_word_t *word, event_word_t *w, uint32_t link) { event_word_t new, old; @@ -158,6 +136,9 @@ event_word_t *word; unsigned long flags; bool_t was_pending; + struct evtchn_fifo_queue *q, *old_q; + unsigned int try; + bool linked = true; port = evtchn->port; word = evtchn_fifo_word_from_port(d, port); @@ -172,17 +153,67 @@ return; } + /* + * Lock all queues related to the event channel (in case of a queue 
change + * this might be two). + * It is mandatory to do that before setting and testing the PENDING bit + * and to hold the current queue lock until the event has been put into the + * list of pending events in order to avoid waking up a guest without the + * event being visibly pending in the guest. + */ + for ( try = 0; try < 3; try++ ) + { + union evtchn_fifo_lastq lastq; + const struct vcpu *old_v; + + lastq.raw = read_atomic(&evtchn->fifo_lastq); + old_v = d->vcpu[lastq.last_vcpu_id]; + + q = &v->evtchn_fifo->queue[evtchn->priority]; + old_q = &old_v->evtchn_fifo->queue[lastq.last_priority]; + + if ( q == old_q ) + spin_lock_irqsave(&q->lock, flags); + else if ( q < old_q ) + { + spin_lock_irqsave(&q->lock, flags); + spin_lock(&old_q->lock); + } + else + { + spin_lock_irqsave(&old_q->lock, flags); + spin_lock(&q->lock); + } + + lastq.raw = read_atomic(&evtchn->fifo_lastq); + old_v = d->vcpu[lastq.last_vcpu_id]; + if ( q == &v->evtchn_fifo->queue[evtchn->priority] && + old_q == &old_v->evtchn_fifo->queue[lastq.last_priority] ) + break; + + if ( q != old_q ) + spin_unlock(&old_q->lock); + spin_unlock_irqrestore(&q->lock, flags); + } + was_pending = guest_test_and_set_bit(d, EVTCHN_FIFO_PENDING, word); + /* If we didn't get the lock bail out. */ + if ( try == 3 ) + { + gprintk(XENLOG_WARNING, + "%pd port %u lost event (too many queue changes)\n", + d, evtchn->port); + goto done; + } + /* * Link the event if it unmasked and not already linked. */ if ( !guest_test_bit(d, EVTCHN_FIFO_MASKED, word) && !guest_test_bit(d, EVTCHN_FIFO_LINKED, word) ) { - struct evtchn_fifo_queue *q, *old_q; event_word_t *tail_word; - bool_t linked = 0; /* * Control block not mapped. The guest must not unmask an @@ -193,25 +224,11 @@ { printk(XENLOG_G_WARNING "%pv has no FIFO event channel control block\n", v); - goto done; + goto unlock; } - /* - * No locking around getting the queue. This may race with - * changing the priority but we are allowed to signal the - * event once on the old priority. - */ - q = &v->evtchn_fifo->queue[evtchn->priority]; - - old_q = lock_old_queue(d, evtchn, &flags); - if ( !old_q ) - goto done; - if ( guest_test_and_set_bit(d, EVTCHN_FIFO_LINKED, word) ) - { - spin_unlock_irqrestore(&old_q->lock, flags); - goto done; - } + goto unlock; /* * If this event was a tail, the old queue is now empty and @@ -224,11 +241,14 @@ /* Moved to a different queue? */ if ( old_q != q ) { - evtchn->last_vcpu_id = evtchn->notify_vcpu_id; - evtchn->last_priority = evtchn->priority; + union evtchn_fifo_lastq lastq = { }; - spin_unlock_irqrestore(&old_q->lock, flags); - spin_lock_irqsave(&q->lock, flags); + lastq.last_vcpu_id = v->vcpu_id; + lastq.last_priority = q->priority; + write_atomic(&evtchn->fifo_lastq, lastq.raw); + + spin_unlock(&old_q->lock); + old_q = q; } /* @@ -241,6 +261,7 @@ * If the queue is empty (i.e., we haven't linked to the new * event), head must be updated. 
*/ + linked = false; if ( q->tail ) { tail_word = evtchn_fifo_word_from_port(d, q->tail); @@ -249,15 +270,19 @@ if ( !linked ) write_atomic(q->head, port); q->tail = port; + } - spin_unlock_irqrestore(&q->lock, flags); + unlock: + if ( q != old_q ) + spin_unlock(&old_q->lock); + spin_unlock_irqrestore(&q->lock, flags); - if ( !linked - && !guest_test_and_set_bit(d, q->priority, - &v->evtchn_fifo->control_block->ready) ) - vcpu_mark_events_pending(v); - } done: + if ( !linked && + !guest_test_and_set_bit(d, q->priority, + &v->evtchn_fifo->control_block->ready) ) + vcpu_mark_events_pending(v); + if ( !was_pending ) evtchn_check_pollers(d, port); } diff -Nru xen-4.11.4+37-g3263f257ca/xen/common/memory.c xen-4.11.4+57-g41a822c392/xen/common/memory.c --- xen-4.11.4+37-g3263f257ca/xen/common/memory.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/common/memory.c 2020-12-01 16:07:03.000000000 +0000 @@ -298,7 +298,10 @@ p2m_type_t p2mt; #endif mfn_t mfn; +#ifdef CONFIG_HAS_PASSTHROUGH + bool *dont_flush_p, dont_flush; int rc; +#endif #ifdef CONFIG_X86 mfn = get_gfn_query(d, gmfn, &p2mt); @@ -376,8 +379,22 @@ return -ENXIO; } +#ifdef CONFIG_HAS_PASSTHROUGH + /* + * Since we're likely to free the page below, we need to suspend + * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes. + */ + dont_flush_p = &this_cpu(iommu_dont_flush_iotlb); + dont_flush = *dont_flush_p; + *dont_flush_p = false; +#endif + rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); +#ifdef CONFIG_HAS_PASSTHROUGH + *dont_flush_p = dont_flush; +#endif + /* * With the lack of an IOMMU on some platforms, domains with DMA-capable * device must retrieve the same pfn when the hypercall populate_physmap @@ -794,11 +811,10 @@ { unsigned int done = 0; long rc = 0; - union xen_add_to_physmap_batch_extra extra; + union add_to_physmap_extra extra = {}; + struct page_info *pages[16]; - if ( xatp->space != XENMAPSPACE_gmfn_foreign ) - extra.res0 = 0; - else + if ( xatp->space == XENMAPSPACE_gmfn_foreign ) extra.foreign_domid = DOMID_INVALID; if ( xatp->space != XENMAPSPACE_gmfn_range ) @@ -814,7 +830,10 @@ #ifdef CONFIG_HAS_PASSTHROUGH if ( need_iommu(d) ) + { this_cpu(iommu_dont_flush_iotlb) = 1; + extra.ppage = &pages[0]; + } #endif while ( xatp->size > done ) @@ -827,8 +846,12 @@ xatp->idx++; xatp->gpfn++; + if ( extra.ppage ) + ++extra.ppage; + /* Check for continuation if it's not the last iteration. */ - if ( xatp->size > ++done && hypercall_preempt_check() ) + if ( (++done >= ARRAY_SIZE(pages) && extra.ppage) || + (xatp->size > done && hypercall_preempt_check()) ) { rc = start + done; break; @@ -839,6 +862,7 @@ if ( need_iommu(d) ) { int ret; + unsigned int i; this_cpu(iommu_dont_flush_iotlb) = 0; @@ -846,6 +870,15 @@ if ( unlikely(ret) && rc >= 0 ) rc = ret; + /* + * Now that the IOMMU TLB flush was done for the original GFN, drop + * the page references. The 2nd flush below is fine to make later, as + * whoever removes the page again from its new GFN will have to do + * another flush anyway. 
+ */ + for ( i = 0; i < done; ++i ) + put_page(pages[i]); + ret = iommu_iotlb_flush(d, xatp->gpfn - done, done); if ( unlikely(ret) && rc >= 0 ) rc = ret; @@ -859,6 +892,8 @@ struct xen_add_to_physmap_batch *xatpb, unsigned int extent) { + union add_to_physmap_extra extra = {}; + if ( xatpb->size < extent ) return -EILSEQ; @@ -867,6 +902,19 @@ !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) ) return -EFAULT; + switch ( xatpb->space ) + { + case XENMAPSPACE_dev_mmio: + /* res0 is reserved for future use. */ + if ( xatpb->u.res0 ) + return -EOPNOTSUPP; + break; + + case XENMAPSPACE_gmfn_foreign: + extra.foreign_domid = xatpb->u.foreign_domid; + break; + } + while ( xatpb->size > extent ) { xen_ulong_t idx; @@ -879,8 +927,7 @@ extent, 1)) ) return -EFAULT; - rc = xenmem_add_to_physmap_one(d, xatpb->space, - xatpb->u, + rc = xenmem_add_to_physmap_one(d, xatpb->space, extra, idx, _gfn(gpfn)); if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) ) diff -Nru xen-4.11.4+37-g3263f257ca/xen/drivers/passthrough/amd/iommu_map.c xen-4.11.4+57-g41a822c392/xen/drivers/passthrough/amd/iommu_map.c --- xen-4.11.4+37-g3263f257ca/xen/drivers/passthrough/amd/iommu_map.c 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/drivers/passthrough/amd/iommu_map.c 2020-12-01 16:07:03.000000000 +0000 @@ -41,7 +41,7 @@ table = map_domain_page(_mfn(l1_mfn)); pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1); - *pte = 0; + write_atomic(pte, 0); unmap_domain_page(table); } @@ -49,7 +49,7 @@ unsigned int next_level, bool_t iw, bool_t ir) { - uint64_t addr_lo, addr_hi, maddr_next; + uint64_t addr_lo, addr_hi, maddr_next, full; u32 entry; bool need_flush = false, old_present; @@ -106,7 +106,7 @@ if ( next_level == IOMMU_PAGING_MODE_LEVEL_0 ) set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_PTE_FC_MASK, IOMMU_PTE_FC_SHIFT, &entry); - pde[1] = entry; + full = (uint64_t)entry << 32; /* mark next level as 'present' */ set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, @@ -118,7 +118,9 @@ set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_PDE_PRESENT_MASK, IOMMU_PDE_PRESENT_SHIFT, &entry); - pde[0] = entry; + full |= entry; + + write_atomic((uint64_t *)pde, full); return need_flush; } @@ -145,7 +147,22 @@ u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid) { u64 addr_hi, addr_lo; - u32 entry; + u32 entry, dte0 = dte[0]; + + if ( valid || + get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_VALID_MASK, + IOMMU_DEV_TABLE_VALID_SHIFT) ) + { + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, dte0, + IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, + IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &dte0); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, dte0, + IOMMU_DEV_TABLE_VALID_MASK, + IOMMU_DEV_TABLE_VALID_SHIFT, &dte0); + dte[0] = dte0; + smp_wmb(); + } + set_field_in_reg_u32(domain_id, 0, IOMMU_DEV_TABLE_DOMAIN_ID_MASK, IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry); @@ -164,8 +181,9 @@ IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK, IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry); dte[1] = entry; + smp_wmb(); - set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, dte0, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry); set_field_in_reg_u32(paging_mode, entry, @@ -178,7 +196,7 @@ IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_VALID_MASK, IOMMU_DEV_TABLE_VALID_SHIFT, &entry); - dte[0] = entry; + write_atomic(&dte[0], entry); } void iommu_dte_set_iotlb(u32 *dte, u8 i) @@ -210,6 +228,7 @@ 
IOMMU_DEV_TABLE_INT_CONTROL_MASK, IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry); dte[5] = entry; + smp_wmb(); set_field_in_reg_u32((u32)addr_lo >> 6, 0, IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK, @@ -227,7 +246,7 @@ IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_INT_VALID_MASK, IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry); - dte[4] = entry; + write_atomic(&dte[4], entry); } void __init iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev) diff -Nru xen-4.11.4+37-g3263f257ca/xen/include/asm-arm/processor.h xen-4.11.4+57-g41a822c392/xen/include/asm-arm/processor.h --- xen-4.11.4+37-g3263f257ca/xen/include/asm-arm/processor.h 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/include/asm-arm/processor.h 2020-12-01 16:07:03.000000000 +0000 @@ -291,6 +291,7 @@ #define VTCR_RES1 (_AC(1,UL)<<31) /* HCPTR Hyp. Coprocessor Trap Register */ +#define HCPTR_TAM ((_AC(1,U)<<30)) #define HCPTR_TTA ((_AC(1,U)<<20)) /* Trap trace registers */ #define HCPTR_CP(x) ((_AC(1,U)<<(x))) /* Trap Coprocessor x */ #define HCPTR_CP_MASK ((_AC(1,U)<<14)-1) diff -Nru xen-4.11.4+37-g3263f257ca/xen/include/asm-x86/msr-index.h xen-4.11.4+57-g41a822c392/xen/include/asm-x86/msr-index.h --- xen-4.11.4+37-g3263f257ca/xen/include/asm-x86/msr-index.h 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/include/asm-x86/msr-index.h 2020-12-01 16:07:03.000000000 +0000 @@ -96,6 +96,38 @@ /* Lower 6 bits define the format of the address in the LBR stack */ #define MSR_IA32_PERF_CAP_LBR_FORMAT 0x3f +/* + * Intel Runtime Average Power Limiting (RAPL) interface. Power plane base + * addresses (MSR_*_POWER_LIMIT) are model specific, but have so-far been + * consistent since their introduction in SandyBridge. + * + * Offsets of functionality from the power plane base is architectural, but + * not all power planes support all functionality. + */ +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +/* Intel Platform-wide power interface. 
*/ +#define MSR_PLATFORM_ENERGY_COUNTER 0x0000064d +#define MSR_PLATFORM_POWER_LIMIT 0x0000065c + #define MSR_IA32_BNDCFGS 0x00000d90 #define IA32_BNDCFGS_ENABLE 0x00000001 #define IA32_BNDCFGS_PRESERVE 0x00000002 @@ -218,6 +250,8 @@ #define MSR_K8_VM_CR 0xc0010114 #define MSR_K8_VM_HSAVE_PA 0xc0010117 +#define MSR_F15H_CU_POWER 0xc001007a +#define MSR_F15H_CU_MAX_POWER 0xc001007b #define MSR_AMD_FAM15H_EVNTSEL0 0xc0010200 #define MSR_AMD_FAM15H_PERFCTR0 0xc0010201 #define MSR_AMD_FAM15H_EVNTSEL1 0xc0010202 @@ -231,6 +265,10 @@ #define MSR_AMD_FAM15H_EVNTSEL5 0xc001020a #define MSR_AMD_FAM15H_PERFCTR5 0xc001020b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 +#define MSR_AMD_CORE_ENERGY_STATUS 0xc001029a +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b + #define MSR_AMD_L7S0_FEATURE_MASK 0xc0011002 #define MSR_AMD_THRM_FEATURE_MASK 0xc0011003 #define MSR_K8_FEATURE_MASK 0xc0011004 diff -Nru xen-4.11.4+37-g3263f257ca/xen/include/xen/event.h xen-4.11.4+57-g41a822c392/xen/include/xen/event.h --- xen-4.11.4+37-g3263f257ca/xen/include/xen/event.h 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/include/xen/event.h 2020-12-01 16:07:03.000000000 +0000 @@ -103,6 +103,21 @@ : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); } +static inline void evtchn_read_lock(struct evtchn *evtchn) +{ + read_lock(&evtchn->lock); +} + +static inline bool evtchn_read_trylock(struct evtchn *evtchn) +{ + return read_trylock(&evtchn->lock); +} + +static inline void evtchn_read_unlock(struct evtchn *evtchn) +{ + read_unlock(&evtchn->lock); +} + static inline bool_t port_is_valid(struct domain *d, unsigned int p) { if ( p >= read_atomic(&d->valid_evtchns) ) @@ -236,11 +251,10 @@ { struct evtchn *evtchn = evtchn_from_port(d, port); bool rc; - unsigned long flags; - spin_lock_irqsave(&evtchn->lock, flags); + evtchn_read_lock(evtchn); rc = evtchn_is_pending(d, evtchn); - spin_unlock_irqrestore(&evtchn->lock, flags); + evtchn_read_unlock(evtchn); return rc; } @@ -255,11 +269,12 @@ { struct evtchn *evtchn = evtchn_from_port(d, port); bool rc; - unsigned long flags; - spin_lock_irqsave(&evtchn->lock, flags); + evtchn_read_lock(evtchn); + rc = evtchn_is_masked(d, evtchn); - spin_unlock_irqrestore(&evtchn->lock, flags); + + evtchn_read_unlock(evtchn); return rc; } diff -Nru xen-4.11.4+37-g3263f257ca/xen/include/xen/mm.h xen-4.11.4+57-g41a822c392/xen/include/xen/mm.h --- xen-4.11.4+37-g3263f257ca/xen/include/xen/mm.h 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/include/xen/mm.h 2020-12-01 16:07:03.000000000 +0000 @@ -577,8 +577,22 @@ &(d)->xenpage_list : &(d)->page_list) #endif +union add_to_physmap_extra { + /* + * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs + * to be kept until after the flush, so the page can't get removed from + * the domain (and re-used for another purpose) beforehand. By passing + * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants + * to have ownership of such a reference transferred in the success case. 
+ */ + struct page_info **ppage; + + /* XENMAPSPACE_gmfn_foreign */ + domid_t foreign_domid; +}; + int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn); int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, diff -Nru xen-4.11.4+37-g3263f257ca/xen/include/xen/sched.h xen-4.11.4+57-g41a822c392/xen/include/xen/sched.h --- xen-4.11.4+37-g3263f257ca/xen/include/xen/sched.h 2020-09-22 15:23:04.000000000 +0000 +++ xen-4.11.4+57-g41a822c392/xen/include/xen/sched.h 2020-12-01 16:07:03.000000000 +0000 @@ -82,7 +82,7 @@ struct evtchn { - spinlock_t lock; + rwlock_t lock; #define ECS_FREE 0 /* Channel is available for use. */ #define ECS_RESERVED 1 /* Channel is reserved. */ #define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */ @@ -111,8 +111,10 @@ u16 virq; /* state == ECS_VIRQ */ } u; u8 priority; - u8 last_priority; - u16 last_vcpu_id; +#ifndef NDEBUG + u8 old_state; /* State when taking lock in write mode. */ +#endif + u32 fifo_lastq; /* Data for fifo events identifying last queue. */ #ifdef CONFIG_XSM union { #ifdef XSM_NEED_GENERIC_EVTCHN_SSID @@ -920,6 +922,22 @@ FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen } cpufreq_controller; +static always_inline bool is_cpufreq_controller(const struct domain *d) +{ + /* + * A PV dom0 can be nominated as the cpufreq controller, instead of using + * Xen's cpufreq driver, at which point dom0 gets direct access to certain + * MSRs. + * + * This interface only works when dom0 is identity pinned and has the same + * number of vCPUs as pCPUs on the system. + * + * It would be far better to paravirtualise the interface. + */ + return (is_pv_domain(d) && is_hardware_domain(d) && + cpufreq_controller == FREQCTL_dom0_kernel); +} + #define CPUPOOLID_NONE -1 struct cpupool *cpupool_get_by_id(int poolid);
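
The l3t_lock()/l3t_unlock() helpers added to xen/arch/x86/mm.c above reuse the
otherwise unused PGT_locked bit of a Xen page table page's type_info as a
spin-style lock, acquired and released with cmpxchg loops. The following
stand-alone sketch reproduces that pattern in user space with GCC atomic
builtins; struct page_info_sketch and the PGT_LOCKED value are illustrative
stand-ins, not Xen's real definitions.

#include <assert.h>
#include <stdint.h>

#define PGT_LOCKED (1UL << 27)               /* stand-in for Xen's PGT_locked bit */

struct page_info_sketch {
    unsigned long type_info;                 /* stand-in for u.inuse.type_info */
};

static void l3t_lock_sketch(struct page_info_sketch *page)
{
    unsigned long x = __atomic_load_n(&page->type_info, __ATOMIC_RELAXED), nx;

    do {
        /* Wait until the lock bit is observed clear. */
        while ( x & PGT_LOCKED )
            x = __atomic_load_n(&page->type_info, __ATOMIC_RELAXED);
        nx = x | PGT_LOCKED;
        /* Try to claim it; on failure x is refreshed and we retry. */
    } while ( !__atomic_compare_exchange_n(&page->type_info, &x, nx, 0,
                                           __ATOMIC_ACQUIRE, __ATOMIC_RELAXED) );
}

static void l3t_unlock_sketch(struct page_info_sketch *page)
{
    unsigned long x = __atomic_load_n(&page->type_info, __ATOMIC_RELAXED), nx;

    do {
        assert(x & PGT_LOCKED);              /* BUG_ON() in the hypervisor */
        nx = x & ~PGT_LOCKED;
        /* Clear only the lock bit, keeping any concurrent flag changes intact. */
    } while ( !__atomic_compare_exchange_n(&page->type_info, &x, nx, 0,
                                           __ATOMIC_RELEASE, __ATOMIC_RELAXED) );
}

int main(void)
{
    struct page_info_sketch pg = { .type_info = 0 };

    l3t_lock_sketch(&pg);
    l3t_unlock_sketch(&pg);
    return 0;
}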
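
The event channel changes above (event_channel.c, event.h, sched.h) convert the
per-channel spinlock into an rwlock: senders take it for reading, using a
trylock from contexts that must not block, while only transitions to or from
ECS_FREE/ECS_UNBOUND take it for writing, which is why a failed trylock may be
treated as "channel not currently usable" and the event dropped. A user-space
sketch of that split, using pthread rwlocks and illustrative names rather than
Xen's types:

#include <pthread.h>
#include <stdio.h>

struct evtchn_sketch {
    pthread_rwlock_t lock;
    int state;                               /* e.g. ECS_FREE, ECS_INTERDOMAIN */
};

/* Sender path: must not block.  A failed trylock means the channel is being
 * bound or torn down right now, so the event can legitimately be dropped. */
static void notify_sketch(struct evtchn_sketch *chn)
{
    if ( pthread_rwlock_tryrdlock(&chn->lock) == 0 )
    {
        printf("deliver event, state=%d\n", chn->state);
        pthread_rwlock_unlock(&chn->lock);
    }
}

/* (Un)bind path: exclusive access while the channel changes state. */
static void bind_sketch(struct evtchn_sketch *chn, int new_state)
{
    pthread_rwlock_wrlock(&chn->lock);
    chn->state = new_state;
    pthread_rwlock_unlock(&chn->lock);
}

int main(void)
{
    struct evtchn_sketch chn = { PTHREAD_RWLOCK_INITIALIZER, 0 };

    bind_sketch(&chn, 2);                    /* say, ECS_UNBOUND */
    notify_sketch(&chn);
    return 0;
}

Because readers no longer spin on a lock that writers hold with interrupts
disabled, the old local_irq_disable()/spin_lock_irqsave() dance around the
per-channel lock can be dropped, as seen in irq.c and pv/shim.c above.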
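
event_fifo.c above stops tracking last_priority/last_vcpu_id as two separate
fields and instead packs them into the new 32-bit fifo_lastq member of struct
evtchn, accessed with read_atomic()/write_atomic(), so the (priority, vCPU)
pair is always observed as one consistent unit. A minimal sketch of that
packing, mirroring the union layout added upstream but with plain GCC builtins
standing in for Xen's atomics:

#include <stdint.h>
#include <stdio.h>

union evtchn_fifo_lastq {                    /* same layout as the union above */
    uint32_t raw;
    struct {
        uint8_t  last_priority;
        uint16_t last_vcpu_id;
    };
};

int main(void)
{
    uint32_t fifo_lastq = 0;                 /* the new field in struct evtchn */
    union evtchn_fifo_lastq q = { 0 };

    /* Writer: fill in both fields, then publish them with one atomic store. */
    q.last_priority = 3;
    q.last_vcpu_id = 7;
    __atomic_store_n(&fifo_lastq, q.raw, __ATOMIC_RELAXED);

    /* Reader: one atomic load yields a consistent (priority, vCPU) pair. */
    q.raw = __atomic_load_n(&fifo_lastq, __ATOMIC_RELAXED);
    printf("priority=%u vcpu=%u\n", q.last_priority, q.last_vcpu_id);
    return 0;
}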
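
The amd/iommu_map.c changes above (XSA-347) make updates to live device table
entries safe against concurrent hardware reads: dependent words are written
first, ordered with smp_wmb(), and the word carrying the valid bit is updated
last with write_atomic() (the page table path instead folds both halves of a
PDE into a single 64-bit atomic write). A schematic user-space sketch of that
discipline, with an invented two-word descriptor and bit value rather than the
real DTE layout:

#include <stdint.h>
#include <stdio.h>

#define DTE_VALID (1u << 0)                  /* stand-in for the valid bit */

/* Publish a two-word descriptor that hardware may be reading concurrently. */
static void publish_dte_sketch(uint32_t *dte, uint32_t hi, uint32_t lo_flags)
{
    /* 1. Write everything the valid word will make reachable. */
    dte[1] = hi;

    /* 2. Order those writes before the valid bit (smp_wmb() in Xen). */
    __atomic_thread_fence(__ATOMIC_RELEASE);

    /* 3. Set the valid bit with a single atomic store (write_atomic() in
     *    Xen), so a half-written entry can never appear valid. */
    __atomic_store_n(&dte[0], lo_flags | DTE_VALID, __ATOMIC_RELAXED);
}

int main(void)
{
    uint32_t dte[2] = { 0, 0 };

    publish_dte_sketch(dte, 0x1234, 0x100);
    printf("dte[0]=%#x dte[1]=%#x\n", dte[0], dte[1]);
    return 0;
}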