From b244ce9e91dc52674512682b54282312ed7ad5a6cbac2cff945a88e06a4ad0de Mon Sep 17 00:00:00 2001
From: Charles Arnold
Date: Thu, 24 Oct 2013 21:00:35 +0000
Subject: [PATCH] - domUloader can no longer be used with the xl toolstack to
 boot sles10. Patch pygrub to get the kernel and initrd from the image.
 pygrub-boot-legacy-sles.patch

- bnc#842515 - VUL-0: CVE-2013-4375: XSA-71: xen: qemu disk backend
  (qdisk) resource leak
  CVE-2013-4375-xsa71.patch
- Upstream patches from Jan
  52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch
  (Replaces CVE-2013-4355-xsa63.patch)
  52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch
  (Replaces CVE-2013-4356-xsa64.patch)
  52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch
  (Replaces CVE-2013-4361-xsa66.patch)
  52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch
  524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch
  524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch
  524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch
  524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch
  524e9dc0-xsm-forbid-PV-guest-console-reads.patch
  5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch
  5256be57-libxl-fix-vif-rate-parsing.patch
  5256be84-tools-ocaml-fix-erroneous-free-of-cpumap-in-stub_xc_vcpu_getaffinity.patch
  5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch
  5257a89a-x86-correct-LDT-checks.patch
  5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch
  5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch
  525b95f4-scheduler-adjust-internal-locking-interface.patch
  525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch
  525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch
  525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch
- bnc#840196 - L3: MTU size on Dom0 gets reset when booting DomU

OBS-URL: https://build.opensuse.org/package/show/Virtualization/xen?expand=0&rev=276
---
 ...vm_copy_from_guest_-phys-virt-errors.patch |   4 +
 ...nitialization-of-PV-shadow-L4-tables.patch |  16 +-
 ...et-up-fbld-emulation-operand-address.patch |   4 +
 ...-create-L3-tables-for-the-direct-map.patch | 116 ++++
 ...ime-s-interaction-with-offline-pcpus.patch |  82 +++
 ...INVALID_PERCPU_AREA-to-non-canonical.patch |  35 +
 ...ability-before-read-VMX-related-MSRs.patch |  82 +++
 ...ix-IA32_VMX_CR4_FIXED1-msr-emulation.patch | 115 ++++
 ...c0-xsm-forbid-PV-guest-console-reads.patch |  28 +
 ...read-result-in-64-bit-OUTS-emulation.patch |  43 ++
 5256be57-libxl-fix-vif-rate-parsing.patch     |  71 ++
 ...f-cpumap-in-stub_xc_vcpu_getaffinity.patch |  28 +
 ...error-handling-in-libxl_list_cpupool.patch |  28 +
 5257a89a-x86-correct-LDT-checks.patch         | 176 +++++
 ...ress-validity-check-to-guest_map_l1e.patch |  26 +
 ...ical-address-before-doing-page-walks.patch |  38 ++
 ...er-adjust-internal-locking-interface.patch | 632 ++++++++++++++++++
 ...ween-sched_move_domain-and-vcpu_wake.patch |  63 ++
 ...use-parked-vcpu-before-destroying-it.patch |  27 +
 ...of-filename-for-warnings-and-crashes.patch |  77 +++
 CVE-2013-4375-xsa71.patch                     |  33 +
 pygrub-boot-legacy-sles.patch                 |  34 +
 set-mtu-from-bridge-for-tap-interface.patch   |  61 ++
 x86-cpufreq-report.patch                      |   2 +-
 xen.changes                                   |  42 ++
 xen.spec                                      |  54 +-
 26 files changed, 1902 insertions(+), 15 deletions(-)
 rename CVE-2013-4355-xsa63.patch => 52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch (97%)
 rename
CVE-2013-4356-xsa64.patch => 52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch (92%) rename CVE-2013-4361-xsa66.patch => 52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch (84%) create mode 100644 52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch create mode 100644 524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch create mode 100644 524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch create mode 100644 524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch create mode 100644 524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch create mode 100644 524e9dc0-xsm-forbid-PV-guest-console-reads.patch create mode 100644 5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch create mode 100644 5256be57-libxl-fix-vif-rate-parsing.patch create mode 100644 5256be84-tools-ocaml-fix-erroneous-free-of-cpumap-in-stub_xc_vcpu_getaffinity.patch create mode 100644 5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch create mode 100644 5257a89a-x86-correct-LDT-checks.patch create mode 100644 5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch create mode 100644 5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch create mode 100644 525b95f4-scheduler-adjust-internal-locking-interface.patch create mode 100644 525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch create mode 100644 525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch create mode 100644 525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch create mode 100644 CVE-2013-4375-xsa71.patch create mode 100644 pygrub-boot-legacy-sles.patch create mode 100644 set-mtu-from-bridge-for-tap-interface.patch diff --git a/CVE-2013-4355-xsa63.patch b/52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch similarity index 97% rename from CVE-2013-4355-xsa63.patch rename to 52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch index 48b7c0c..1aa9ec5 100644 --- a/CVE-2013-4355-xsa63.patch +++ b/52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch @@ -1,5 +1,9 @@ References: bnc#840592 CVE-2013-4355 XSA-63 +# Commit 6bb838e7375f5b031e9ac346b353775c90de45dc +# Date 2013-09-30 14:17:46 +0200 +# Author Jan Beulich +# Committer Jan Beulich x86: properly handle hvm_copy_from_guest_{phys,virt}() errors Ignoring them generally implies using uninitialized data and, in all diff --git a/CVE-2013-4356-xsa64.patch b/52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch similarity index 92% rename from CVE-2013-4356-xsa64.patch rename to 52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch index 5def99b..5aa4b7e 100644 --- a/CVE-2013-4356-xsa64.patch +++ b/52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch @@ -1,31 +1,33 @@ References: bnc#840593 CVE-2013-4356 XSA-64 +# Commit f46befdd825c8a459c5eb21adb7d5b0dc6e30ad5 +# Date 2013-09-30 14:18:25 +0200 +# Author Tim Deegan +# Committer Jan Beulich x86/mm/shadow: Fix initialization of PV shadow L4 tables. - + Shadowed PV L4 tables must have the same Xen mappings as their unshadowed equivalent. This is done by copying the Xen entries verbatim from the idle pagetable, and then using guest_l4_slot() in the SHADOW_FOREACH_L4E() iterator to avoid touching those entries. 
- + adc5afbf1c70ef55c260fb93e4b8ce5ccb918706 (x86: support up to 16Tb) changed the definition of ROOT_PAGETABLE_XEN_SLOTS to extend right to the top of the address space, which causes the shadow code to copy Xen mappings into guest-kernel-address slots too. - + In the common case, all those slots are zero in the idle pagetable, and no harm is done. But if any slot above #271 is non-zero, Xen will crash when that slot is later cleared (it attempts to drop shadow-pagetable refcounts on its own L4 pagetables). - + Fix by using the new ROOT_PAGETABLE_PV_XEN_SLOTS when appropriate. Monitor pagetables need the full Xen mappings, so they keep using the old name (with its new semantics). This is CVE-2013-4356 / XSA-64. - -Reported-by: Andrew Cooper + Signed-off-by: Tim Deegan -Tested-by: Andrew Cooper Reviewed-by: Jan Beulich --- a/xen/arch/x86/mm/shadow/multi.c diff --git a/CVE-2013-4361-xsa66.patch b/52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch similarity index 84% rename from CVE-2013-4361-xsa66.patch rename to 52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch index aa80323..f65afa3 100644 --- a/CVE-2013-4361-xsa66.patch +++ b/52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch @@ -1,5 +1,9 @@ References: bnc#841766 CVE-2013-4361 XSA-66 +# Commit 28b706efb6abb637fabfd74cde70a50935a5640b +# Date 2013-09-30 14:18:58 +0200 +# Author Jan Beulich +# Committer Jan Beulich x86: properly set up fbld emulation operand address This is CVE-2013-4361 / XSA-66. diff --git a/52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch b/52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch new file mode 100644 index 0000000..fe8318d --- /dev/null +++ b/52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch @@ -0,0 +1,116 @@ +# Commit ca145fe70bad3a25ad54c6ded1ef237e45a2311e +# Date 2013-09-30 15:28:12 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: don't blindly create L3 tables for the direct map + +Now that the direct map area can extend all the way up to almost the +end of address space, this is wasteful. + +Also fold two almost redundant messages in SRAT parsing into one. + +Signed-off-by: Jan Beulich +Tested-by: Malcolm Crossley +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -137,7 +137,7 @@ l1_pgentry_t __attribute__ ((__section__ + #define PTE_UPDATE_WITH_CMPXCHG + #endif + +-bool_t __read_mostly mem_hotplug = 0; ++paddr_t __read_mostly mem_hotplug; + + /* Private domain structs for DOMID_XEN and DOMID_IO. 
*/ + struct domain *dom_xen, *dom_io, *dom_cow; +--- a/xen/arch/x86/srat.c ++++ b/xen/arch/x86/srat.c +@@ -113,6 +113,7 @@ static __init void bad_srat(void) + apicid_to_node[i] = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(pxm2node); i++) + pxm2node[i] = NUMA_NO_NODE; ++ mem_hotplug = 0; + } + + /* +@@ -257,13 +258,6 @@ acpi_numa_memory_affinity_init(struct ac + return; + } + /* It is fine to add this area to the nodes data it will be used later*/ +- if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) +- { +- printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n", +- start, end); +- mem_hotplug = 1; +- } +- + i = conflicting_memblks(start, end); + if (i == node) { + printk(KERN_WARNING +@@ -287,8 +281,11 @@ acpi_numa_memory_affinity_init(struct ac + if (nd->end < end) + nd->end = end; + } +- printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm, +- start, end); ++ if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug) ++ mem_hotplug = end; ++ printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n", ++ node, pxm, start, end, ++ ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : ""); + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; +--- a/xen/arch/x86/x86_64/mm.c ++++ b/xen/arch/x86/x86_64/mm.c +@@ -621,25 +621,20 @@ void __init paging_init(void) + * We setup the L3s for 1:1 mapping if host support memory hotplug + * to avoid sync the 1:1 mapping on page fault handler + */ +- if ( mem_hotplug ) ++ for ( va = DIRECTMAP_VIRT_START; ++ va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug); ++ va += (1UL << L4_PAGETABLE_SHIFT) ) + { +- unsigned long va; +- +- for ( va = DIRECTMAP_VIRT_START; +- va < DIRECTMAP_VIRT_END; +- va += (1UL << L4_PAGETABLE_SHIFT) ) ++ if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) & ++ _PAGE_PRESENT) ) + { +- if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) & +- _PAGE_PRESENT) ) +- { +- l3_pg = alloc_domheap_page(NULL, 0); +- if ( !l3_pg ) +- goto nomem; +- l3_ro_mpt = page_to_virt(l3_pg); +- clear_page(l3_ro_mpt); +- l4e_write(&idle_pg_table[l4_table_offset(va)], +- l4e_from_page(l3_pg, __PAGE_HYPERVISOR)); +- } ++ l3_pg = alloc_domheap_page(NULL, 0); ++ if ( !l3_pg ) ++ goto nomem; ++ l3_ro_mpt = page_to_virt(l3_pg); ++ clear_page(l3_ro_mpt); ++ l4e_write(&idle_pg_table[l4_table_offset(va)], ++ l4e_from_page(l3_pg, __PAGE_HYPERVISOR)); + } + } + +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -399,7 +399,7 @@ static inline int get_page_and_type(stru + int check_descriptor(const struct domain *, struct desc_struct *d); + + extern bool_t opt_allow_superpage; +-extern bool_t mem_hotplug; ++extern paddr_t mem_hotplug; + + /****************************************************************************** + * With shadow pagetables, the different kinds of address start diff --git a/524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch b/524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch new file mode 100644 index 0000000..24f0f16 --- /dev/null +++ b/524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch @@ -0,0 +1,82 @@ +# Commit 0aa27ce3351f7eb09d13e863a1d5f303086aa32a +# Date 2013-10-04 12:23:23 +0200 +# Author Andrew Cooper +# Committer Jan Beulich +x86/idle: Fix get_cpu_idle_time()'s interaction with offline pcpus + +Checking for "idle_vcpu[cpu] != NULL" is insufficient protection against +offline pcpus. 
From a hypercall, vcpu_runstate_get() will determine "v != +current", and try to take the vcpu_schedule_lock(). This will try to look up +per_cpu(schedule_data, v->processor) and promptly suffer a NULL structure +deference as v->processors' __per_cpu_offset is INVALID_PERCPU_AREA. + +One example might look like this: + +... +Xen call trace: + [] vcpu_runstate_get+0x50/0x113 + [] get_cpu_idle_time+0x28/0x2e + [] do_sysctl+0x3db/0xeb8 + [] compat_hypercall+0xbd/0x116 + +Pagetable walk from 0000000000000040: + L4[0x000] = 0000000186df8027 0000000000028207 + L3[0x000] = 0000000188e36027 00000000000261c9 + L2[0x000] = 0000000000000000 ffffffffffffffff + +**************************************** +Panic on CPU 11: +... + +get_cpu_idle_time() has been updated to correctly deal with offline pcpus +itself by returning 0, in the same way as it would if it was missing the +idle_vcpu[] pointer. + +In doing so, XENPF_getidletime needed updating to correctly retain its +described behaviour of clearing bits in the cpumap for offline pcpus. + +As this crash can only be triggered with toolstack hypercalls, it is not a +security issue and just a simple bug. + +Signed-off-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/arch/x86/platform_hypercall.c ++++ b/xen/arch/x86/platform_hypercall.c +@@ -355,10 +355,14 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA + + for_each_cpu ( cpu, cpumap ) + { +- if ( idle_vcpu[cpu] == NULL ) +- cpumask_clear_cpu(cpu, cpumap); + idletime = get_cpu_idle_time(cpu); + ++ if ( !idletime ) ++ { ++ cpumask_clear_cpu(cpu, cpumap); ++ continue; ++ } ++ + if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) ) + { + ret = -EFAULT; +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -176,13 +176,12 @@ void vcpu_runstate_get(struct vcpu *v, s + + uint64_t get_cpu_idle_time(unsigned int cpu) + { +- struct vcpu_runstate_info state; +- struct vcpu *v; ++ struct vcpu_runstate_info state = { 0 }; ++ struct vcpu *v = idle_vcpu[cpu]; + +- if ( (v = idle_vcpu[cpu]) == NULL ) +- return 0; ++ if ( cpu_online(cpu) && v ) ++ vcpu_runstate_get(v, &state); + +- vcpu_runstate_get(v, &state); + return state.time[RUNSTATE_running]; + } + diff --git a/524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch b/524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch new file mode 100644 index 0000000..7102ff3 --- /dev/null +++ b/524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch @@ -0,0 +1,35 @@ +# Commit 7cfb0053629c4dd1a6f01dc43cca7c0c25b8b7bf +# Date 2013-10-04 12:24:34 +0200 +# Author Andrew Cooper +# Committer Jan Beulich +x86/percpu: Force INVALID_PERCPU_AREA into the non-canonical address region + +This causes accidental uses of per_cpu() on a pcpu with an INVALID_PERCPU_AREA +to result in a #GF for attempting to access the middle of the non-canonical +virtual address region. + +This is preferable to the current behaviour, where incorrect use of per_cpu() +will result in an effective NULL structure dereference which has security +implication in the context of PV guests. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/arch/x86/percpu.c ++++ b/xen/arch/x86/percpu.c +@@ -6,7 +6,14 @@ + #include + + unsigned long __per_cpu_offset[NR_CPUS]; +-#define INVALID_PERCPU_AREA (-(long)__per_cpu_start) ++ ++/* ++ * Force uses of per_cpu() with an invalid area to attempt to access the ++ * middle of the non-canonical address space resulting in a #GP, rather than a ++ * possible #PF at (NULL + a little) which has security implications in the ++ * context of PV guests. ++ */ ++#define INVALID_PERCPU_AREA (0x8000000000000000L - (long)__per_cpu_start) + #define PERCPU_ORDER (get_order_from_bytes(__per_cpu_data_end-__per_cpu_start)) + + void __init percpu_init_areas(void) diff --git a/524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch b/524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch new file mode 100644 index 0000000..1532797 --- /dev/null +++ b/524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch @@ -0,0 +1,82 @@ +# Commit 190b667ac20e8175758f4a3a0f13c4d990e6af7e +# Date 2013-10-04 12:28:14 +0200 +# Author Yang Zhang +# Committer Jan Beulich +Nested VMX: check VMX capability before read VMX related MSRs + +VMX MSRs only available when the CPU support the VMX feature. In addition, +VMX_TRUE* MSRs only available when bit 55 of VMX_BASIC MSR is set. + +Signed-off-by: Yang Zhang + +Cleanup. + +Signed-off-by: Jan Beulich +Acked-by: Jun Nakajima + +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -78,6 +78,7 @@ static DEFINE_PER_CPU(struct list_head, + static DEFINE_PER_CPU(bool_t, vmxon); + + static u32 vmcs_revision_id __read_mostly; ++u64 __read_mostly vmx_basic_msr; + + static void __init vmx_display_features(void) + { +@@ -301,6 +302,8 @@ static int vmx_init_vmcs_config(void) + vmx_vmexit_control = _vmx_vmexit_control; + vmx_vmentry_control = _vmx_vmentry_control; + cpu_has_vmx_ins_outs_instr_info = !!(vmx_basic_msr_high & (1U<<22)); ++ vmx_basic_msr = ((u64)vmx_basic_msr_high << 32) | ++ vmx_basic_msr_low; + vmx_display_features(); + } + else +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -1814,12 +1814,33 @@ int nvmx_handle_invvpid(struct cpu_user_ + int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) + { + struct vcpu *v = current; ++ unsigned int ecx, dummy; + u64 data = 0, host_data = 0; + int r = 1; + + if ( !nestedhvm_enabled(v->domain) ) + return 0; + ++ /* VMX capablity MSRs are available only when guest supports VMX. */ ++ hvm_cpuid(0x1, &dummy, &dummy, &ecx, &dummy); ++ if ( !(ecx & cpufeat_mask(X86_FEATURE_VMXE)) ) ++ return 0; ++ ++ /* ++ * Those MSRs are available only when bit 55 of ++ * MSR_IA32_VMX_BASIC is set. 
++ */ ++ switch ( msr ) ++ { ++ case MSR_IA32_VMX_TRUE_PINBASED_CTLS: ++ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: ++ case MSR_IA32_VMX_TRUE_EXIT_CTLS: ++ case MSR_IA32_VMX_TRUE_ENTRY_CTLS: ++ if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) ) ++ return 0; ++ break; ++ } ++ + rdmsrl(msr, host_data); + + /* +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -284,6 +284,8 @@ extern bool_t cpu_has_vmx_ins_outs_instr + */ + #define VMX_BASIC_DEFAULT1_ZERO (1ULL << 55) + ++extern u64 vmx_basic_msr; ++ + /* Guest interrupt status */ + #define VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK 0x0FF + #define VMX_GUEST_INTR_STATUS_SVI_OFFSET 8 diff --git a/524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch b/524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch new file mode 100644 index 0000000..f4dd99f --- /dev/null +++ b/524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch @@ -0,0 +1,115 @@ +# Commit c6f92aed0e209df823d2cb5780dbb1ea12fc6d4a +# Date 2013-10-04 12:30:09 +0200 +# Author Yang Zhang +# Committer Jan Beulich +Nested VMX: fix IA32_VMX_CR4_FIXED1 msr emulation + +Currently, it use hardcode value for IA32_VMX_CR4_FIXED1. This is wrong. +We should check guest's cpuid to know which bits are writeable in CR4 by guest +and allow the guest to set the corresponding bit only when guest has the feature. + +Signed-off-by: Yang Zhang + +Cleanup. + +Signed-off-by: Jan Beulich +Acked-by: Jun Nakajima + +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -1814,7 +1814,7 @@ int nvmx_handle_invvpid(struct cpu_user_ + int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) + { + struct vcpu *v = current; +- unsigned int ecx, dummy; ++ unsigned int eax, ebx, ecx, edx, dummy; + u64 data = 0, host_data = 0; + int r = 1; + +@@ -1822,7 +1822,7 @@ int nvmx_msr_read_intercept(unsigned int + return 0; + + /* VMX capablity MSRs are available only when guest supports VMX. */ +- hvm_cpuid(0x1, &dummy, &dummy, &ecx, &dummy); ++ hvm_cpuid(0x1, &dummy, &dummy, &ecx, &edx); + if ( !(ecx & cpufeat_mask(X86_FEATURE_VMXE)) ) + return 0; + +@@ -1946,8 +1946,55 @@ int nvmx_msr_read_intercept(unsigned int + data = X86_CR4_VMXE; + break; + case MSR_IA32_VMX_CR4_FIXED1: +- /* allow 0-settings except SMXE */ +- data = 0x267ff & ~X86_CR4_SMXE; ++ if ( edx & cpufeat_mask(X86_FEATURE_VME) ) ++ data |= X86_CR4_VME | X86_CR4_PVI; ++ if ( edx & cpufeat_mask(X86_FEATURE_TSC) ) ++ data |= X86_CR4_TSD; ++ if ( edx & cpufeat_mask(X86_FEATURE_DE) ) ++ data |= X86_CR4_DE; ++ if ( edx & cpufeat_mask(X86_FEATURE_PSE) ) ++ data |= X86_CR4_PSE; ++ if ( edx & cpufeat_mask(X86_FEATURE_PAE) ) ++ data |= X86_CR4_PAE; ++ if ( edx & cpufeat_mask(X86_FEATURE_MCE) ) ++ data |= X86_CR4_MCE; ++ if ( edx & cpufeat_mask(X86_FEATURE_PGE) ) ++ data |= X86_CR4_PGE; ++ if ( edx & cpufeat_mask(X86_FEATURE_FXSR) ) ++ data |= X86_CR4_OSFXSR; ++ if ( edx & cpufeat_mask(X86_FEATURE_XMM) ) ++ data |= X86_CR4_OSXMMEXCPT; ++ if ( ecx & cpufeat_mask(X86_FEATURE_VMXE) ) ++ data |= X86_CR4_VMXE; ++ if ( ecx & cpufeat_mask(X86_FEATURE_SMXE) ) ++ data |= X86_CR4_SMXE; ++ if ( ecx & cpufeat_mask(X86_FEATURE_PCID) ) ++ data |= X86_CR4_PCIDE; ++ if ( ecx & cpufeat_mask(X86_FEATURE_XSAVE) ) ++ data |= X86_CR4_OSXSAVE; ++ ++ hvm_cpuid(0x0, &eax, &dummy, &dummy, &dummy); ++ switch ( eax ) ++ { ++ default: ++ hvm_cpuid(0xa, &eax, &dummy, &dummy, &dummy); ++ /* Check whether guest has the perf monitor feature. 
*/ ++ if ( (eax & 0xff) && (eax & 0xff00) ) ++ data |= X86_CR4_PCE; ++ /* fall through */ ++ case 0x7 ... 0x9: ++ ecx = 0; ++ hvm_cpuid(0x7, &dummy, &ebx, &ecx, &dummy); ++ if ( ebx & cpufeat_mask(X86_FEATURE_FSGSBASE) ) ++ data |= X86_CR4_FSGSBASE; ++ if ( ebx & cpufeat_mask(X86_FEATURE_SMEP) ) ++ data |= X86_CR4_SMEP; ++ if ( ebx & cpufeat_mask(X86_FEATURE_SMAP) ) ++ data |= X86_CR4_SMAP; ++ /* fall through */ ++ case 0x0 ... 0x6: ++ break; ++ } + break; + case MSR_IA32_VMX_MISC: + /* Do not support CR3-target feature now */ +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -148,6 +148,7 @@ + #define X86_FEATURE_INVPCID (7*32+10) /* Invalidate Process Context ID */ + #define X86_FEATURE_RTM (7*32+11) /* Restricted Transactional Memory */ + #define X86_FEATURE_NO_FPU_SEL (7*32+13) /* FPU CS/DS stored as zero */ ++#define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */ + + #define cpu_has(c, bit) test_bit(bit, (c)->x86_capability) + #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability) +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -87,6 +87,7 @@ + #define X86_CR4_PCIDE 0x20000 /* enable PCID */ + #define X86_CR4_OSXSAVE 0x40000 /* enable XSAVE/XRSTOR */ + #define X86_CR4_SMEP 0x100000/* enable SMEP */ ++#define X86_CR4_SMAP 0x200000/* enable SMAP */ + + /* + * Trap/fault mnemonics. diff --git a/524e9dc0-xsm-forbid-PV-guest-console-reads.patch b/524e9dc0-xsm-forbid-PV-guest-console-reads.patch new file mode 100644 index 0000000..fac4aa4 --- /dev/null +++ b/524e9dc0-xsm-forbid-PV-guest-console-reads.patch @@ -0,0 +1,28 @@ +# Commit 65ba631bcb62c79eb33ebfde8a0471fd012c37a8 +# Date 2013-10-04 12:51:44 +0200 +# Author Daniel De Graaf +# Committer Jan Beulich +xsm: forbid PV guest console reads + +The CONSOLEIO_read operation was incorrectly allowed to PV guests if the +hypervisor was compiled in debug mode (with VERBOSE defined). + +Reported-by: Jan Beulich +Signed-off-by: Daniel De Graaf + +--- a/xen/include/xsm/dummy.h ++++ b/xen/include/xsm/dummy.h +@@ -222,10 +222,10 @@ static XSM_INLINE int xsm_console_io(XSM + { + XSM_ASSERT_ACTION(XSM_OTHER); + #ifdef VERBOSE +- return xsm_default_action(XSM_HOOK, current->domain, NULL); +-#else +- return xsm_default_action(XSM_PRIV, current->domain, NULL); ++ if ( cmd == CONSOLEIO_write ) ++ return xsm_default_action(XSM_HOOK, d, NULL); + #endif ++ return xsm_default_action(XSM_PRIV, d, NULL); + } + + static XSM_INLINE int xsm_profile(XSM_DEFAULT_ARG struct domain *d, int op) diff --git a/5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch b/5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch new file mode 100644 index 0000000..b298a33 --- /dev/null +++ b/5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch @@ -0,0 +1,43 @@ +References: bnc#842511 CVE-2013-4368 XSA-67 + +# Commit 0771faba163769089c9f05f7f76b63e397677613 +# Date 2013-10-10 15:19:53 +0200 +# Author Matthew Daley +# Committer Jan Beulich +x86: check segment descriptor read result in 64-bit OUTS emulation + +When emulating such an operation from a 64-bit context (CS has long +mode set), and the data segment is overridden to FS/GS, the result of +reading the overridden segment's descriptor (read_descriptor) is not +checked. If it fails, data_base is left uninitialized. + +This can lead to 8 bytes of Xen's stack being leaked to the guest +(implicitly, i.e. via the address given in a #PF). 
+ +Coverity-ID: 1055116 + +This is CVE-2013-4368 / XSA-67. + +Signed-off-by: Matthew Daley + +Fix formatting. + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -1990,10 +1990,10 @@ static int emulate_privileged_op(struct + break; + } + } +- else +- read_descriptor(data_sel, v, regs, +- &data_base, &data_limit, &ar, +- 0); ++ else if ( !read_descriptor(data_sel, v, regs, ++ &data_base, &data_limit, &ar, 0) || ++ !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) ) ++ goto fail; + data_limit = ~0UL; + ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P; + } diff --git a/5256be57-libxl-fix-vif-rate-parsing.patch b/5256be57-libxl-fix-vif-rate-parsing.patch new file mode 100644 index 0000000..c9f16b3 --- /dev/null +++ b/5256be57-libxl-fix-vif-rate-parsing.patch @@ -0,0 +1,71 @@ +References: bnc#842512 CVE-2013-4369 XSA-68 + +# Commit c53702cee1d6f9f1b72f0cae0b412e21bcda8724 +# Date 2013-10-10 15:48:55 +0100 +# Author Ian Jackson +# Committer Ian Jackson +libxl: fix vif rate parsing + +strtok can return NULL here. We don't need to use strtok anyway, so just +use a simple strchr method. + +Coverity-ID: 1055642 + +This is CVE-2013-4369 / XSA-68 + +Signed-off-by: Matthew Daley + +Fix type. Add test case + +Signed-off-by: Ian Campbell + +--- a/tools/libxl/check-xl-vif-parse ++++ b/tools/libxl/check-xl-vif-parse +@@ -206,4 +206,8 @@ expected +# Committer Ian Jackson +tools/ocaml: fix erroneous free of cpumap in stub_xc_vcpu_getaffinity + +Not sure how it got there... + +Coverity-ID: 1056196 + +This is CVE-2013-4370 / XSA-69 + +Signed-off-by: Matthew Daley +Acked-by: Ian Campbell + +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -461,8 +461,6 @@ CAMLprim value stub_xc_vcpu_getaffinity( + + retval = xc_vcpu_getaffinity(_H(xch), _D(domid), + Int_val(vcpu), c_cpumap); +- free(c_cpumap); +- + if (retval < 0) { + free(c_cpumap); + failwith_xc(_H(xch)); diff --git a/5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch b/5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch new file mode 100644 index 0000000..5585156 --- /dev/null +++ b/5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch @@ -0,0 +1,28 @@ +References: bnc#842514 CVE-2013-4371 XSA-70 + +# Commit 4c37ed562224295c0f8b00211287d57cae629782 +# Date 2013-10-10 15:49:54 +0100 +# Author Matthew Daley +# Committer Ian Jackson +libxl: fix out-of-memory error handling in libxl_list_cpupool + +...otherwise it will return freed memory. All the current users of this +function check already for a NULL return, so use that. 
+ +Coverity-ID: 1056194 + +This is CVE-2013-4371 / XSA-70 + +Signed-off-by: Matthew Daley +Acked-by: Ian Campbell + +--- a/tools/libxl/libxl.c ++++ b/tools/libxl/libxl.c +@@ -649,6 +649,7 @@ libxl_cpupoolinfo * libxl_list_cpupool(l + if (!tmp) { + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "allocating cpupool info"); + libxl_cpupoolinfo_list_free(ptr, i); ++ ptr = NULL; + goto out; + } + ptr = tmp; diff --git a/5257a89a-x86-correct-LDT-checks.patch b/5257a89a-x86-correct-LDT-checks.patch new file mode 100644 index 0000000..b0bb2bf --- /dev/null +++ b/5257a89a-x86-correct-LDT-checks.patch @@ -0,0 +1,176 @@ +# Commit 40d66baa46ca8a9ffa6df3e063a967d08ec92bcf +# Date 2013-10-11 09:28:26 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: correct LDT checks + +- MMUEXT_SET_LDT should behave as similarly to the LLDT instruction as + possible: fail only if the base address is non-canonical +- instead LDT descriptor accesses should fault if the descriptor + address ends up being non-canonical (by ensuring this we at once + avoid reading an entry from the mach-to-phys table and consider it a + page table entry) +- fault propagation on using LDT selectors must distinguish #PF and #GP + (the latter must be raised for a non-canonical descriptor address, + which also applies to several other uses of propagate_page_fault(), + and hence the problem is being fixed there) +- map_ldt_shadow_page() should properly wrap addresses for 32-bit VMs + +At once remove the odd invokation of map_ldt_shadow_page() from the +MMUEXT_SET_LDT handler: There's nothing really telling us that the +first LDT page is going to be preferred over others. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -674,12 +674,7 @@ int arch_set_info_guest( + fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs); + } + +- /* LDT safety checks. */ +- if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) || +- (c.nat->ldt_ents > 8192) || +- !array_access_ok(c.nat->ldt_base, +- c.nat->ldt_ents, +- LDT_ENTRY_SIZE) ) ++ if ( !__addr_ok(c.nat->ldt_base) ) + return -EINVAL; + } + else +@@ -692,15 +687,12 @@ int arch_set_info_guest( + + for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ ) + fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs); +- +- /* LDT safety checks. */ +- if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) || +- (c.cmp->ldt_ents > 8192) || +- !compat_array_access_ok(c.cmp->ldt_base, +- c.cmp->ldt_ents, +- LDT_ENTRY_SIZE) ) +- return -EINVAL; + } ++ ++ /* LDT safety checks. 
*/ ++ if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) || ++ (c(ldt_ents) > 8192) ) ++ return -EINVAL; + } + + v->fpu_initialised = !!(flags & VGCF_I387_VALID); +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -582,6 +582,8 @@ int map_ldt_shadow_page(unsigned int off + + BUG_ON(unlikely(in_irq())); + ++ if ( is_pv_32bit_domain(d) ) ++ gva = (u32)gva; + guest_get_eff_kern_l1e(v, gva, &l1e); + if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) ) + return 0; +@@ -3229,9 +3231,8 @@ long do_mmuext_op( + MEM_LOG("ignoring SET_LDT hypercall from external domain"); + okay = 0; + } +- else if ( ((ptr & (PAGE_SIZE-1)) != 0) || +- (ents > 8192) || +- !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) ) ++ else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) || ++ (ents > 8192) ) + { + okay = 0; + MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents); +@@ -3244,8 +3245,6 @@ long do_mmuext_op( + curr->arch.pv_vcpu.ldt_base = ptr; + curr->arch.pv_vcpu.ldt_ents = ents; + load_LDT(curr); +- if ( ents != 0 ) +- (void)map_ldt_shadow_page(0); + } + break; + } +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -1070,12 +1070,24 @@ static void reserved_bit_page_fault( + show_execution_state(regs); + } + +-void propagate_page_fault(unsigned long addr, u16 error_code) ++struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code) + { + struct trap_info *ti; + struct vcpu *v = current; + struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; + ++ if ( unlikely(!is_canonical_address(addr)) ) ++ { ++ ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_gp_fault]; ++ tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; ++ tb->error_code = 0; ++ tb->cs = ti->cs; ++ tb->eip = ti->address; ++ if ( TI_GET_IF(ti) ) ++ tb->flags |= TBF_INTERRUPT; ++ return tb; ++ } ++ + v->arch.pv_vcpu.ctrlreg[2] = addr; + arch_set_cr2(v, addr); + +@@ -1102,6 +1114,8 @@ void propagate_page_fault(unsigned long + + if ( unlikely(error_code & PFEC_reserved_bit) ) + reserved_bit_page_fault(addr, guest_cpu_user_regs()); ++ ++ return NULL; + } + + static int handle_gdt_ldt_mapping_fault( +@@ -1135,13 +1149,16 @@ static int handle_gdt_ldt_mapping_fault( + } + else + { ++ struct trap_bounce *tb; ++ + /* In hypervisor mode? Leave it to the #PF handler to fix up. */ + if ( !guest_mode(regs) ) + return 0; +- /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */ +- propagate_page_fault( +- curr->arch.pv_vcpu.ldt_base + offset, +- regs->error_code); ++ /* In guest mode? Propagate fault to guest, with adjusted %cr2. 
*/ ++ tb = propagate_page_fault(curr->arch.pv_vcpu.ldt_base + offset, ++ regs->error_code); ++ if ( tb ) ++ tb->error_code = ((u16)offset & ~3) | 4; + } + } + else +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -555,7 +555,7 @@ int new_guest_cr3(unsigned long pfn); + void make_cr3(struct vcpu *v, unsigned long mfn); + void update_cr3(struct vcpu *v); + int vcpu_destroy_pagetables(struct vcpu *); +-void propagate_page_fault(unsigned long addr, u16 error_code); ++struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code); + void *do_page_walk(struct vcpu *v, unsigned long addr); + + int __sync_local_execstate(void); +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -386,7 +386,8 @@ guest_get_eff_l1e(struct vcpu *v, unsign + if ( likely(!paging_mode_translate(v->domain)) ) + { + ASSERT(!paging_mode_external(v->domain)); +- if ( __copy_from_user(eff_l1e, ++ if ( !__addr_ok(addr) || ++ __copy_from_user(eff_l1e, + &__linear_l1_table[l1_linear_offset(addr)], + sizeof(l1_pgentry_t)) != 0 ) + *(l1_pgentry_t *)eff_l1e = l1e_empty(); diff --git a/5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch b/5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch new file mode 100644 index 0000000..1a99973 --- /dev/null +++ b/5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch @@ -0,0 +1,26 @@ +# Commit d06a0d715ec1423b6c42141ab1b0ff69a3effb56 +# Date 2013-10-11 09:29:43 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: add address validity check to guest_map_l1e() + +Just like for guest_get_eff_l1e() this prevents accessing as page +tables (and with the wrong memory attribute) internal data inside Xen +happening to be mapped with 1Gb pages. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -360,7 +360,8 @@ guest_map_l1e(struct vcpu *v, unsigned l + return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn); + + /* Find this l1e and its enclosing l1mfn in the linear map */ +- if ( __copy_from_user(&l2e, ++ if ( !__addr_ok(addr) || ++ __copy_from_user(&l2e, + &__linear_l2_table[l2_linear_offset(addr)], + sizeof(l2_pgentry_t)) != 0 ) + return NULL; diff --git a/5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch b/5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch new file mode 100644 index 0000000..1720d85 --- /dev/null +++ b/5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch @@ -0,0 +1,38 @@ +# Commit 6fd9b0361e2eb5a7f12bdd5cbf7e42c0d1937d26 +# Date 2013-10-11 09:31:16 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: check for canonical address before doing page walks + +... as there doesn't really exists any valid mapping for them. + +Particularly in the case of do_page_walk() this also avoids returning +non-NULL for such invalid input. 
+ +Suggested-by: Andrew Cooper +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/arch/x86/x86_64/mm.c ++++ b/xen/arch/x86/x86_64/mm.c +@@ -135,7 +135,7 @@ void *do_page_walk(struct vcpu *v, unsig + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + +- if ( is_hvm_vcpu(v) ) ++ if ( is_hvm_vcpu(v) || !is_canonical_address(addr) ) + return NULL; + + l4t = map_domain_page(mfn); +--- a/xen/arch/x86/x86_64/traps.c ++++ b/xen/arch/x86/x86_64/traps.c +@@ -169,6 +169,8 @@ void show_page_walk(unsigned long addr) + l1_pgentry_t l1e, *l1t; + + printk("Pagetable walk from %016lx:\n", addr); ++ if ( !is_canonical_address(addr) ) ++ return; + + l4t = map_domain_page(mfn); + l4e = l4t[l4_table_offset(addr)]; diff --git a/525b95f4-scheduler-adjust-internal-locking-interface.patch b/525b95f4-scheduler-adjust-internal-locking-interface.patch new file mode 100644 index 0000000..e295b8d --- /dev/null +++ b/525b95f4-scheduler-adjust-internal-locking-interface.patch @@ -0,0 +1,632 @@ +# Commit eedd60391610629b4e8a2e8278b857ff884f750d +# Date 2013-10-14 08:57:56 +0200 +# Author Jan Beulich +# Committer Jan Beulich +scheduler: adjust internal locking interface + +Make the locking functions return the lock pointers, so they can be +passed to the unlocking functions (which in turn can check that the +lock is still actually providing the intended protection, i.e. the +parameters determining which lock is the right one didn't change). + +Further use proper spin lock primitives rather than open coded +local_irq_...() constructs, so that interrupts can be re-enabled as +appropriate while spinning. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/common/sched_credit.c ++++ b/xen/common/sched_credit.c +@@ -1170,6 +1170,7 @@ csched_runq_sort(struct csched_private * + struct csched_pcpu * const spc = CSCHED_PCPU(cpu); + struct list_head *runq, *elem, *next, *last_under; + struct csched_vcpu *svc_elem; ++ spinlock_t *lock; + unsigned long flags; + int sort_epoch; + +@@ -1179,7 +1180,7 @@ csched_runq_sort(struct csched_private * + + spc->runq_sort_last = sort_epoch; + +- pcpu_schedule_lock_irqsave(cpu, flags); ++ lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + runq = &spc->runq; + elem = runq->next; +@@ -1204,7 +1205,7 @@ csched_runq_sort(struct csched_private * + elem = next; + } + +- pcpu_schedule_unlock_irqrestore(cpu, flags); ++ pcpu_schedule_unlock_irqrestore(lock, flags, cpu); + } + + static void +@@ -1568,7 +1569,9 @@ csched_load_balance(struct csched_privat + * could cause a deadlock if the peer CPU is also load + * balancing and trying to lock this CPU. + */ +- if ( !pcpu_schedule_trylock(peer_cpu) ) ++ spinlock_t *lock = pcpu_schedule_trylock(peer_cpu); ++ ++ if ( !lock ) + { + SCHED_STAT_CRANK(steal_trylock_failed); + peer_cpu = cpumask_cycle(peer_cpu, &workers); +@@ -1578,7 +1581,7 @@ csched_load_balance(struct csched_privat + /* Any work over there to steal? */ + speer = cpumask_test_cpu(peer_cpu, online) ? + csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; +- pcpu_schedule_unlock(peer_cpu); ++ pcpu_schedule_unlock(lock, peer_cpu); + + /* As soon as one vcpu is found, balancing ends */ + if ( speer != NULL ) +--- a/xen/common/sched_credit2.c ++++ b/xen/common/sched_credit2.c +@@ -881,15 +881,17 @@ csched_vcpu_insert(const struct schedule + */ + if ( ! is_idle_vcpu(vc) ) + { ++ spinlock_t *lock; ++ + /* FIXME: Do we need the private lock here? 
*/ + list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); + + /* Add vcpu to runqueue of initial processor */ +- vcpu_schedule_lock_irq(vc); ++ lock = vcpu_schedule_lock_irq(vc); + + runq_assign(ops, vc); + +- vcpu_schedule_unlock_irq(vc); ++ vcpu_schedule_unlock_irq(lock, vc); + + sdom->nr_vcpus++; + } +@@ -916,14 +918,16 @@ csched_vcpu_remove(const struct schedule + + if ( ! is_idle_vcpu(vc) ) + { ++ spinlock_t *lock; ++ + SCHED_STAT_CRANK(vcpu_destroy); + + /* Remove from runqueue */ +- vcpu_schedule_lock_irq(vc); ++ lock = vcpu_schedule_lock_irq(vc); + + runq_deassign(ops, vc); + +- vcpu_schedule_unlock_irq(vc); ++ vcpu_schedule_unlock_irq(lock, vc); + + /* Remove from sdom list. Don't need a lock for this, as it's called + * syncronously when nothing else can happen. */ +@@ -1010,8 +1014,7 @@ csched_context_saved(const struct schedu + { + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + s_time_t now = NOW(); +- +- vcpu_schedule_lock_irq(vc); ++ spinlock_t *lock = vcpu_schedule_lock_irq(vc); + + BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor)); + +@@ -1037,7 +1040,7 @@ csched_context_saved(const struct schedu + else if ( !is_idle_vcpu(vc) ) + update_load(ops, svc->rqd, svc, -1, now); + +- vcpu_schedule_unlock_irq(vc); ++ vcpu_schedule_unlock_irq(lock, vc); + } + + #define MAX_LOAD (1ULL<<60); +@@ -1454,14 +1457,14 @@ csched_dom_cntl( + * must never lock csched_priv.lock if we're holding a runqueue lock. + * Also, calling vcpu_schedule_lock() is enough, since IRQs have already + * been disabled. */ +- vcpu_schedule_lock(svc->vcpu); ++ spinlock_t *lock = vcpu_schedule_lock(svc->vcpu); + + BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor)); + + svc->weight = sdom->weight; + update_max_weight(svc->rqd, svc->weight, old_weight); + +- vcpu_schedule_unlock(svc->vcpu); ++ vcpu_schedule_unlock(lock, svc->vcpu); + } + } + } +@@ -1991,6 +1994,7 @@ static void init_pcpu(const struct sched + cpumask_set_cpu(cpu, &rqd->idle); + cpumask_set_cpu(cpu, &rqd->active); + ++ /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */ + spin_unlock(old_lock); + + cpumask_set_cpu(cpu, &prv->initialized); +--- a/xen/common/sched_sedf.c ++++ b/xen/common/sched_sedf.c +@@ -1350,14 +1350,16 @@ static int sedf_adjust_weights(struct cp + if ( EDOM_INFO(p)->weight ) + { + /* Interrupts already off */ +- vcpu_schedule_lock(p); ++ spinlock_t *lock = vcpu_schedule_lock(p); ++ + EDOM_INFO(p)->period_orig = + EDOM_INFO(p)->period = WEIGHT_PERIOD; + EDOM_INFO(p)->slice_orig = + EDOM_INFO(p)->slice = + (EDOM_INFO(p)->weight * + (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[cpu]; +- vcpu_schedule_unlock(p); ++ ++ vcpu_schedule_unlock(lock, p); + } + } + } +@@ -1418,21 +1420,24 @@ static int sedf_adjust(const struct sche + { + /* (Here and everywhere in the following) IRQs are already off, + * hence vcpu_spin_lock() is the one. 
*/ +- vcpu_schedule_lock(v); ++ spinlock_t *lock = vcpu_schedule_lock(v); ++ + EDOM_INFO(v)->extraweight = op->u.sedf.weight; + EDOM_INFO(v)->weight = 0; + EDOM_INFO(v)->slice = 0; + EDOM_INFO(v)->period = WEIGHT_PERIOD; +- vcpu_schedule_unlock(v); ++ vcpu_schedule_unlock(lock, v); + } + } + else + { + /* Weight-driven domains with real-time execution */ +- for_each_vcpu ( p, v ) { +- vcpu_schedule_lock(v); ++ for_each_vcpu ( p, v ) ++ { ++ spinlock_t *lock = vcpu_schedule_lock(v); ++ + EDOM_INFO(v)->weight = op->u.sedf.weight; +- vcpu_schedule_unlock(v); ++ vcpu_schedule_unlock(lock, v); + } + } + } +@@ -1454,14 +1459,15 @@ static int sedf_adjust(const struct sche + /* Time-driven domains */ + for_each_vcpu ( p, v ) + { +- vcpu_schedule_lock(v); ++ spinlock_t *lock = vcpu_schedule_lock(v); ++ + EDOM_INFO(v)->weight = 0; + EDOM_INFO(v)->extraweight = 0; + EDOM_INFO(v)->period_orig = + EDOM_INFO(v)->period = op->u.sedf.period; + EDOM_INFO(v)->slice_orig = + EDOM_INFO(v)->slice = op->u.sedf.slice; +- vcpu_schedule_unlock(v); ++ vcpu_schedule_unlock(lock, v); + } + } + +@@ -1471,13 +1477,14 @@ static int sedf_adjust(const struct sche + + for_each_vcpu ( p, v ) + { +- vcpu_schedule_lock(v); ++ spinlock_t *lock = vcpu_schedule_lock(v); ++ + EDOM_INFO(v)->status = + (EDOM_INFO(v)->status & + ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE); + EDOM_INFO(v)->latency = op->u.sedf.latency; + extraq_check(v); +- vcpu_schedule_unlock(v); ++ vcpu_schedule_unlock(lock, v); + } + } + else if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -160,18 +160,16 @@ static inline void vcpu_runstate_change( + + void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) + { ++ spinlock_t *lock = likely(v == current) ? 
NULL : vcpu_schedule_lock_irq(v); + s_time_t delta; + +- if ( unlikely(v != current) ) +- vcpu_schedule_lock_irq(v); +- + memcpy(runstate, &v->runstate, sizeof(*runstate)); + delta = NOW() - runstate->state_entry_time; + if ( delta > 0 ) + runstate->time[runstate->state] += delta; + +- if ( unlikely(v != current) ) +- vcpu_schedule_unlock_irq(v); ++ if ( unlikely(lock != NULL) ) ++ vcpu_schedule_unlock_irq(lock, v); + } + + uint64_t get_cpu_idle_time(unsigned int cpu) +@@ -333,8 +331,7 @@ void sched_destroy_domain(struct domain + void vcpu_sleep_nosync(struct vcpu *v) + { + unsigned long flags; +- +- vcpu_schedule_lock_irqsave(v, flags); ++ spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); + + if ( likely(!vcpu_runnable(v)) ) + { +@@ -344,7 +341,7 @@ void vcpu_sleep_nosync(struct vcpu *v) + SCHED_OP(VCPU2OP(v), sleep, v); + } + +- vcpu_schedule_unlock_irqrestore(v, flags); ++ vcpu_schedule_unlock_irqrestore(lock, flags, v); + + TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); + } +@@ -362,8 +359,7 @@ void vcpu_sleep_sync(struct vcpu *v) + void vcpu_wake(struct vcpu *v) + { + unsigned long flags; +- +- vcpu_schedule_lock_irqsave(v, flags); ++ spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); + + if ( likely(vcpu_runnable(v)) ) + { +@@ -377,7 +373,7 @@ void vcpu_wake(struct vcpu *v) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + } + +- vcpu_schedule_unlock_irqrestore(v, flags); ++ vcpu_schedule_unlock_irqrestore(lock, flags, v); + + TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); + } +@@ -528,10 +524,11 @@ static void vcpu_migrate(struct vcpu *v) + */ + void vcpu_force_reschedule(struct vcpu *v) + { +- vcpu_schedule_lock_irq(v); ++ spinlock_t *lock = vcpu_schedule_lock_irq(v); ++ + if ( v->is_running ) + set_bit(_VPF_migrating, &v->pause_flags); +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + + if ( test_bit(_VPF_migrating, &v->pause_flags) ) + { +@@ -546,7 +543,7 @@ void restore_vcpu_affinity(struct domain + + for_each_vcpu ( d, v ) + { +- vcpu_schedule_lock_irq(v); ++ spinlock_t *lock = vcpu_schedule_lock_irq(v); + + if ( v->affinity_broken ) + { +@@ -559,13 +556,13 @@ void restore_vcpu_affinity(struct domain + if ( v->processor == smp_processor_id() ) + { + set_bit(_VPF_migrating, &v->pause_flags); +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + vcpu_sleep_nosync(v); + vcpu_migrate(v); + } + else + { +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + } + } + +@@ -592,7 +589,7 @@ int cpu_disable_scheduler(unsigned int c + { + for_each_vcpu ( d, v ) + { +- vcpu_schedule_lock_irq(v); ++ spinlock_t *lock = vcpu_schedule_lock_irq(v); + + cpumask_and(&online_affinity, v->cpu_affinity, c->cpu_valid); + if ( cpumask_empty(&online_affinity) && +@@ -613,13 +610,13 @@ int cpu_disable_scheduler(unsigned int c + if ( v->processor == cpu ) + { + set_bit(_VPF_migrating, &v->pause_flags); +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + vcpu_sleep_nosync(v); + vcpu_migrate(v); + } + else + { +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + } + + /* +@@ -646,6 +643,7 @@ int vcpu_set_affinity(struct vcpu *v, co + { + cpumask_t online_affinity; + cpumask_t *online; ++ spinlock_t *lock; + + if ( v->domain->is_pinned ) + return -EINVAL; +@@ -654,7 +652,7 @@ int vcpu_set_affinity(struct vcpu *v, co + if ( cpumask_empty(&online_affinity) ) + return -EINVAL; + +- vcpu_schedule_lock_irq(v); ++ lock = vcpu_schedule_lock_irq(v); + + 
cpumask_copy(v->cpu_affinity, affinity); + +@@ -662,7 +660,7 @@ int vcpu_set_affinity(struct vcpu *v, co + * when changing the affinity */ + set_bit(_VPF_migrating, &v->pause_flags); + +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + + domain_update_node_affinity(v->domain); + +@@ -776,10 +774,10 @@ static long do_poll(struct sched_poll *s + static long do_yield(void) + { + struct vcpu * v=current; ++ spinlock_t *lock = vcpu_schedule_lock_irq(v); + +- vcpu_schedule_lock_irq(v); + SCHED_OP(VCPU2OP(v), yield, v); +- vcpu_schedule_unlock_irq(v); ++ vcpu_schedule_unlock_irq(lock, v); + + TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); +@@ -1140,6 +1138,7 @@ static void schedule(void) + unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do); + bool_t tasklet_work_scheduled = 0; + struct schedule_data *sd; ++ spinlock_t *lock; + struct task_slice next_slice; + int cpu = smp_processor_id(); + +@@ -1166,7 +1165,7 @@ static void schedule(void) + BUG(); + } + +- pcpu_schedule_lock_irq(cpu); ++ lock = pcpu_schedule_lock_irq(cpu); + + stop_timer(&sd->s_timer); + +@@ -1183,7 +1182,7 @@ static void schedule(void) + + if ( unlikely(prev == next) ) + { +- pcpu_schedule_unlock_irq(cpu); ++ pcpu_schedule_unlock_irq(lock, cpu); + trace_continue_running(next); + return continue_running(prev); + } +@@ -1221,7 +1220,7 @@ static void schedule(void) + ASSERT(!next->is_running); + next->is_running = 1; + +- pcpu_schedule_unlock_irq(cpu); ++ pcpu_schedule_unlock_irq(lock, cpu); + + SCHED_STAT_CRANK(sched_ctx); + +@@ -1408,6 +1407,7 @@ int schedule_cpu_switch(unsigned int cpu + { + unsigned long flags; + struct vcpu *idle; ++ spinlock_t *lock; + void *ppriv, *ppriv_old, *vpriv, *vpriv_old; + struct scheduler *old_ops = per_cpu(scheduler, cpu); + struct scheduler *new_ops = (c == NULL) ? &ops : c->sched; +@@ -1426,7 +1426,7 @@ int schedule_cpu_switch(unsigned int cpu + return -ENOMEM; + } + +- pcpu_schedule_lock_irqsave(cpu, flags); ++ lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + SCHED_OP(old_ops, tick_suspend, cpu); + vpriv_old = idle->sched_priv; +@@ -1437,7 +1437,7 @@ int schedule_cpu_switch(unsigned int cpu + SCHED_OP(new_ops, tick_resume, cpu); + SCHED_OP(new_ops, insert_vcpu, idle); + +- pcpu_schedule_unlock_irqrestore(cpu, flags); ++ pcpu_schedule_unlock_irqrestore(lock, flags, cpu); + + SCHED_OP(old_ops, free_vdata, vpriv_old); + SCHED_OP(old_ops, free_pdata, ppriv_old, cpu); +@@ -1495,10 +1495,11 @@ void schedule_dump(struct cpupool *c) + + for_each_cpu (i, cpus) + { +- pcpu_schedule_lock(i); ++ spinlock_t *lock = pcpu_schedule_lock(i); ++ + printk("CPU[%02d] ", i); + SCHED_OP(sched, dump_cpu_state, i); +- pcpu_schedule_unlock(i); ++ pcpu_schedule_unlock(lock, i); + } + } + +--- a/xen/include/xen/sched-if.h ++++ b/xen/include/xen/sched-if.h +@@ -47,96 +47,70 @@ DECLARE_PER_CPU(struct schedule_data, sc + DECLARE_PER_CPU(struct scheduler *, scheduler); + DECLARE_PER_CPU(struct cpupool *, cpupool); + +-static inline spinlock_t * pcpu_schedule_lock(int cpu) +-{ +- spinlock_t * lock=NULL; +- +- for ( ; ; ) +- { +- /* The per_cpu(v->processor) may also change, if changing +- * cpu pool also changes the scheduler lock. Retry +- * until they match. +- */ +- lock=per_cpu(schedule_data, cpu).schedule_lock; +- +- spin_lock(lock); +- if ( likely(lock == per_cpu(schedule_data, cpu).schedule_lock) ) +- break; +- spin_unlock(lock); +- } +- return lock; ++#define sched_lock(kind, param, cpu, irq, arg...) 
\ ++static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ ++{ \ ++ for ( ; ; ) \ ++ { \ ++ spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock; \ ++ /* \ ++ * v->processor may change when grabbing the lock; but \ ++ * per_cpu(v->processor) may also change, if changing cpu pool \ ++ * also changes the scheduler lock. Retry until they match. \ ++ * \ ++ * It may also be the case that v->processor may change but the \ ++ * lock may be the same; this will succeed in that case. \ ++ */ \ ++ spin_lock##irq(lock, ## arg); \ ++ if ( likely(lock == per_cpu(schedule_data, cpu).schedule_lock) ) \ ++ return lock; \ ++ spin_unlock##irq(lock, ## arg); \ ++ } \ + } + +-static inline int pcpu_schedule_trylock(int cpu) +-{ +- spinlock_t * lock=NULL; +- +- lock=per_cpu(schedule_data, cpu).schedule_lock; +- if ( ! spin_trylock(lock) ) +- return 0; +- if ( lock == per_cpu(schedule_data, cpu).schedule_lock ) +- return 1; +- else +- { +- spin_unlock(lock); +- return 0; +- } ++#define sched_unlock(kind, param, cpu, irq, arg...) \ ++static inline void kind##_schedule_unlock##irq(spinlock_t *lock \ ++ EXTRA_TYPE(arg), param) \ ++{ \ ++ ASSERT(lock == per_cpu(schedule_data, cpu).schedule_lock); \ ++ spin_unlock##irq(lock, ## arg); \ + } + +-#define pcpu_schedule_lock_irq(p) \ +- do { local_irq_disable(); pcpu_schedule_lock(p); } while ( 0 ) +-#define pcpu_schedule_lock_irqsave(p, flags) \ +- do { local_irq_save(flags); pcpu_schedule_lock(p); } while ( 0 ) ++#define EXTRA_TYPE(arg) ++sched_lock(pcpu, unsigned int cpu, cpu, ) ++sched_lock(vcpu, const struct vcpu *v, v->processor, ) ++sched_lock(pcpu, unsigned int cpu, cpu, _irq) ++sched_lock(vcpu, const struct vcpu *v, v->processor, _irq) ++sched_unlock(pcpu, unsigned int cpu, cpu, ) ++sched_unlock(vcpu, const struct vcpu *v, v->processor, ) ++sched_unlock(pcpu, unsigned int cpu, cpu, _irq) ++sched_unlock(vcpu, const struct vcpu *v, v->processor, _irq) ++#undef EXTRA_TYPE ++ ++#define EXTRA_TYPE(arg) , unsigned long arg ++#define spin_unlock_irqsave spin_unlock_irqrestore ++sched_lock(pcpu, unsigned int cpu, cpu, _irqsave, *flags) ++sched_lock(vcpu, const struct vcpu *v, v->processor, _irqsave, *flags) ++#undef spin_unlock_irqsave ++sched_unlock(pcpu, unsigned int cpu, cpu, _irqrestore, flags) ++sched_unlock(vcpu, const struct vcpu *v, v->processor, _irqrestore, flags) ++#undef EXTRA_TYPE ++ ++#undef sched_unlock ++#undef sched_lock + +-static inline void pcpu_schedule_unlock(int cpu) ++static inline spinlock_t *pcpu_schedule_trylock(unsigned int cpu) + { +- spin_unlock(per_cpu(schedule_data, cpu).schedule_lock); +-} ++ spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock; + +-#define pcpu_schedule_unlock_irq(p) \ +- do { pcpu_schedule_unlock(p); local_irq_enable(); } while ( 0 ) +-#define pcpu_schedule_unlock_irqrestore(p, flags) \ +- do { pcpu_schedule_unlock(p); local_irq_restore(flags); } while ( 0 ) +- +-static inline void vcpu_schedule_lock(struct vcpu *v) +-{ +- spinlock_t * lock; +- +- for ( ; ; ) +- { +- /* v->processor may change when grabbing the lock; but +- * per_cpu(v->processor) may also change, if changing +- * cpu pool also changes the scheduler lock. Retry +- * until they match. +- * +- * It may also be the case that v->processor may change +- * but the lock may be the same; this will succeed +- * in that case. 
+- */ +- lock=per_cpu(schedule_data, v->processor).schedule_lock; +- +- spin_lock(lock); +- if ( likely(lock == per_cpu(schedule_data, v->processor).schedule_lock) ) +- break; +- spin_unlock(lock); +- } +-} +- +-#define vcpu_schedule_lock_irq(v) \ +- do { local_irq_disable(); vcpu_schedule_lock(v); } while ( 0 ) +-#define vcpu_schedule_lock_irqsave(v, flags) \ +- do { local_irq_save(flags); vcpu_schedule_lock(v); } while ( 0 ) +- +-static inline void vcpu_schedule_unlock(struct vcpu *v) +-{ +- spin_unlock(per_cpu(schedule_data, v->processor).schedule_lock); ++ if ( !spin_trylock(lock) ) ++ return NULL; ++ if ( lock == per_cpu(schedule_data, cpu).schedule_lock ) ++ return lock; ++ spin_unlock(lock); ++ return NULL; + } + +-#define vcpu_schedule_unlock_irq(v) \ +- do { vcpu_schedule_unlock(v); local_irq_enable(); } while ( 0 ) +-#define vcpu_schedule_unlock_irqrestore(v, flags) \ +- do { vcpu_schedule_unlock(v); local_irq_restore(flags); } while ( 0 ) +- + struct task_slice { + struct vcpu *task; + s_time_t time; diff --git a/525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch b/525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch new file mode 100644 index 0000000..ba86fce --- /dev/null +++ b/525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch @@ -0,0 +1,63 @@ +# Commit ef55257bc81204e34691f1c2aa9e01f2d0768bdd +# Date 2013-10-14 08:58:31 +0200 +# Author David Vrabel +# Committer Jan Beulich +sched: fix race between sched_move_domain() and vcpu_wake() + +From: David Vrabel + +sched_move_domain() changes v->processor for all the domain's VCPUs. +If another domain, softirq etc. triggers a simultaneous call to +vcpu_wake() (e.g., by setting an event channel as pending), then +vcpu_wake() may lock one schedule lock and try to unlock another. + +vcpu_schedule_lock() attempts to handle this but only does so for the +window between reading the schedule_lock from the per-CPU data and the +spin_lock() call. This does not help with sched_move_domain() +changing v->processor between the calls to vcpu_schedule_lock() and +vcpu_schedule_unlock(). + +Fix the race by taking the schedule_lock for v->processor in +sched_move_domain(). + +Signed-off-by: David Vrabel +Acked-by: Juergen Gross + +Use vcpu_schedule_lock_irq() (which now returns the lock) to properly +retry the locking should the to be used lock have changed in the course +of acquiring it (issue pointed out by George Dunlap). + +Add a comment explaining the state after the v->processor adjustment. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Acked-by: Keir Fraser + +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -276,6 +276,8 @@ int sched_move_domain(struct domain *d, + new_p = cpumask_first(c->cpu_valid); + for_each_vcpu ( d, v ) + { ++ spinlock_t *lock; ++ + vcpudata = v->sched_priv; + + migrate_timer(&v->periodic_timer, new_p); +@@ -283,7 +285,16 @@ int sched_move_domain(struct domain *d, + migrate_timer(&v->poll_timer, new_p); + + cpumask_setall(v->cpu_affinity); ++ ++ lock = vcpu_schedule_lock_irq(v); + v->processor = new_p; ++ /* ++ * With v->processor modified we must not ++ * - make any further changes assuming we hold the scheduler lock, ++ * - use vcpu_schedule_unlock_irq(). 
++ */ ++ spin_unlock_irq(lock); ++ + v->sched_priv = vcpu_priv[v->vcpu_id]; + evtchn_move_pirqs(v); + diff --git a/525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch b/525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch new file mode 100644 index 0000000..451d926 --- /dev/null +++ b/525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch @@ -0,0 +1,27 @@ +# Commit d38a668b6ef8c84d1d3fda9947ffb0056d01fe3a +# Date 2013-10-16 12:26:48 +0200 +# Author Juergen Gross +# Committer Jan Beulich +credit: unpause parked vcpu before destroying it + +A capped out vcpu must be unpaused in case of moving it to another cpupool, +otherwise it will be paused forever. + +Signed-off-by: Juergen Gross +Acked-by: George Dunlap + +--- a/xen/common/sched_credit.c ++++ b/xen/common/sched_credit.c +@@ -931,6 +931,12 @@ csched_vcpu_remove(const struct schedule + + SCHED_STAT_CRANK(vcpu_destroy); + ++ if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) ++ { ++ SCHED_STAT_CRANK(vcpu_unpark); ++ vcpu_unpause(svc->vcpu); ++ } ++ + if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); + diff --git a/525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch b/525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch new file mode 100644 index 0000000..1aec395 --- /dev/null +++ b/525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch @@ -0,0 +1,77 @@ +# Commit f72cb6bbc10348f4f7671428e5db509731e9e6a5 +# Date 2013-10-17 11:35:26 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: print relevant (tail) part of filename for warnings and crashes + +In particular when the origin construct is in a header file (and +hence the file name is an absolute path instead of just the file name +portion) the information can otherwise become rather useless when the +build tree isn't sitting relatively close to the file system root. + +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser + +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -953,7 +953,7 @@ void do_invalid_op(struct cpu_user_regs + { + struct bug_frame bug; + struct bug_frame_str bug_str; +- const char *p, *filename, *predicate, *eip = (char *)regs->eip; ++ const char *p, *prefix = "", *filename, *predicate, *eip = (char *)regs->eip; + unsigned long fixup; + int id, lineno; + +@@ -995,12 +995,19 @@ void do_invalid_op(struct cpu_user_regs + } + + /* WARN, BUG or ASSERT: decode the filename pointer and line number. */ +- filename = p; ++ fixup = strlen(p); ++ if ( fixup > 50 ) ++ { ++ filename = p + fixup - 47; ++ prefix = "..."; ++ } ++ else ++ filename = p; + lineno = bug.id >> 2; + + if ( id == BUGFRAME_warn ) + { +- printk("Xen WARN at %.50s:%d\n", filename, lineno); ++ printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); + show_execution_state(regs); + regs->eip = (unsigned long)eip; + return; +@@ -1008,10 +1015,10 @@ void do_invalid_op(struct cpu_user_regs + + if ( id == BUGFRAME_bug ) + { +- printk("Xen BUG at %.50s:%d\n", filename, lineno); ++ printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno); + DEBUGGER_trap_fatal(TRAP_invalid_op, regs); + show_execution_state(regs); +- panic("Xen BUG at %.50s:%d\n", filename, lineno); ++ panic("Xen BUG at %s%s:%d\n", prefix, filename, lineno); + } + + /* ASSERT: decode the predicate string pointer. 
*/ +@@ -1025,12 +1032,12 @@ void do_invalid_op(struct cpu_user_regs + + if ( !is_kernel(predicate) ) + predicate = ""; +- printk("Assertion '%s' failed at %.50s:%d\n", +- predicate, filename, lineno); ++ printk("Assertion '%s' failed at %s%s:%d\n", ++ predicate, prefix, filename, lineno); + DEBUGGER_trap_fatal(TRAP_invalid_op, regs); + show_execution_state(regs); +- panic("Assertion '%s' failed at %.50s:%d\n", +- predicate, filename, lineno); ++ panic("Assertion '%s' failed at %s%s:%d\n", ++ predicate, prefix, filename, lineno); + + die: + if ( (fixup = search_exception_table(regs->eip)) != 0 ) diff --git a/CVE-2013-4375-xsa71.patch b/CVE-2013-4375-xsa71.patch new file mode 100644 index 0000000..6b59cd3 --- /dev/null +++ b/CVE-2013-4375-xsa71.patch @@ -0,0 +1,33 @@ +References: bnc#842515 CVE-2013-4375 XSA-71 + +xen_disk: mark ioreq as mapped before unmapping in error case + +Commit c6961b7d ("xen_disk: use bdrv_aio_flush instead of bdrv_flush") +modified the semantics of ioreq_{un,}map so that they are idempotent if +called when they're not needed (ie., twice in a row). However, it neglected +to handle the case where batch mapping is not being used (the default), and +one of the grants fails to map. In this case, ioreq_unmap will be called to +unwind and unmap any mappings already performed, but ioreq_unmap simply +returns due to the aforementioned change (the ioreq has not already been +marked as mapped). + +The frontend user can therefore force xen_disk to leak grant mappings, a +per-backend-domain limited resource. + +Fix by marking the ioreq as mapped before calling ioreq_unmap in this +situation. + +This is XSA-71 / CVE-2013-4375 + +Signed-off-by: Matthew Daley + +--- a/tools/qemu-xen-dir-remote/hw/xen_disk.c ++++ b/tools/qemu-xen-dir-remote/hw/xen_disk.c +@@ -406,6 +406,7 @@ static int ioreq_map(struct ioreq *ioreq + xen_be_printf(&ioreq->blkdev->xendev, 0, + "can't map grant ref %d (%s, %d maps)\n", + refs[i], strerror(errno), ioreq->blkdev->cnt_map); ++ ioreq->mapped = 1; + ioreq_unmap(ioreq); + return -1; + } diff --git a/pygrub-boot-legacy-sles.patch b/pygrub-boot-legacy-sles.patch new file mode 100644 index 0000000..8aa62cf --- /dev/null +++ b/pygrub-boot-legacy-sles.patch @@ -0,0 +1,34 @@ +Index: xen-4.3.0-testing/tools/pygrub/src/pygrub +=================================================================== +--- xen-4.3.0-testing.orig/tools/pygrub/src/pygrub ++++ xen-4.3.0-testing/tools/pygrub/src/pygrub +@@ -606,6 +606,14 @@ def run_grub(file, entry, fs, cfg_args): + print " args: %s" % img.args + print " initrd: %s" % img.initrd[1] + ++ # If grub has no menu entries to select, look for vmlinuz-xen and initrd-xen in /boot ++ if len(g.cf.images) == 0: ++ chosencfg = { "kernel": None, "ramdisk": None, "args": "" } ++ chosencfg = sniff_xen_kernel(fs, incfg) ++ if chosencfg["kernel"] and chosencfg["ramdisk"]: ++ chosencfg["args"] = cfg_args ++ return chosencfg ++ + if interactive and not list_entries: + curses.wrapper(run_main) + else: +@@ -692,6 +700,14 @@ def sniff_netware(fs, cfg): + + return cfg + ++def sniff_xen_kernel(fs, cfg): ++ if not cfg["kernel"] and fs.file_exists('/boot/vmlinuz-xen'): ++ cfg["kernel"] = '/boot/vmlinuz-xen' ++ if cfg["kernel"] and not cfg["ramdisk"]: ++ if fs.file_exists('/boot/initrd-xen'): ++ cfg["ramdisk"] = '/boot/initrd-xen' ++ return cfg ++ + def format_sxp(kernel, ramdisk, args): + s = "linux (kernel %s)" % kernel + if ramdisk: diff --git a/set-mtu-from-bridge-for-tap-interface.patch b/set-mtu-from-bridge-for-tap-interface.patch new file mode 
100644 index 0000000..258f687 --- /dev/null +++ b/set-mtu-from-bridge-for-tap-interface.patch @@ -0,0 +1,61 @@ +# HG changeset patch +# User Charles Arnold +# Date 1379427987 -3600 +# Node ID e6da6ffd6749237316d4440799f0a0272bbdae9c +# Parent 5597ce99ec7f2587a29f3b2dee0bde98d59bf327 +tools/hotplug: set mtu from bridge for tap interface + +With changeset 22885 support was added for setting the MTU in the vif-bridge +script for when a vif interface was set to 'online'. The was not done for the +'add' operation. The 'add' operation was added to the script for when tap +devices were specified (c/s 21944). With the setting of the MTU for the +'online' case was there a reason for omitting the 'add'? + +This patch sets the MTU for both 'online' and 'add' in the vif-bridge script. + +Signed-off-by: Charles Arnold +Acked-by: Ian Campbell + +Index: xen-4.3.0-testing/tools/hotplug/Linux/vif-bridge +=================================================================== +--- xen-4.3.0-testing.orig/tools/hotplug/Linux/vif-bridge ++++ xen-4.3.0-testing/tools/hotplug/Linux/vif-bridge +@@ -89,11 +89,7 @@ fi + case "$command" in + online) + setup_virtual_bridge_port "$dev" +- mtu="`ip link show $bridge | awk '/mtu/ { print $5 }'`" +- if [ -n "$mtu" ] && [ "$mtu" -gt 0 ] +- then +- ip link set $dev mtu $mtu || : +- fi ++ set_mtu "$bridge" "$dev" + add_to_bridge "$bridge" "$dev" + ;; + +@@ -104,6 +100,7 @@ case "$command" in + + add) + setup_virtual_bridge_port "$dev" ++ set_mtu "$bridge" "$dev" + add_to_bridge "$bridge" "$dev" + ;; + esac +Index: xen-4.3.0-testing/tools/hotplug/Linux/xen-network-common.sh +=================================================================== +--- xen-4.3.0-testing.orig/tools/hotplug/Linux/xen-network-common.sh ++++ xen-4.3.0-testing/tools/hotplug/Linux/xen-network-common.sh +@@ -132,3 +132,13 @@ add_to_bridge () { + ip link set ${dev} up + } + ++# Usage: set_mtu bridge dev ++set_mtu () { ++ local bridge=$1 ++ local dev=$2 ++ mtu="`ip link show ${bridge}| awk '/mtu/ { print $5 }'`" ++ if [ -n "$mtu" ] && [ "$mtu" -gt 0 ] ++ then ++ ip link set ${dev} mtu $mtu || : ++ fi ++} diff --git a/x86-cpufreq-report.patch b/x86-cpufreq-report.patch index 45db59e..b2d9c70 100644 --- a/x86-cpufreq-report.patch +++ b/x86-cpufreq-report.patch @@ -9,7 +9,7 @@ #include #include #include -@@ -597,6 +597,41 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA +@@ -601,6 +601,41 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA } break; diff --git a/xen.changes b/xen.changes index 01c360c..210cfe4 100644 --- a/xen.changes +++ b/xen.changes @@ -1,9 +1,51 @@ +------------------------------------------------------------------- +Tue Oct 22 13:42:54 MDT 2013 - carnold@suse.com + +- domUloader can no longer be used with the xl toolstack to boot + sles10. Patch pygrub to get the kernel and initrd from the image. 
+ pygrub-boot-legacy-sles.patch + +------------------------------------------------------------------- +Mon Oct 21 09:57:54 MDT 2013 - carnold@suse.com + +- bnc#842515 - VUL-0: CVE-2013-4375: XSA-71: xen: qemu disk backend + (qdisk) resource leak + CVE-2013-4375-xsa71.patch +- Upstream patches from Jan + 52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch (Replaces CVE-2013-4355-xsa63.patch) + 52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch (Replaces CVE-2013-4356-xsa64.patch) + 52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch (Replaces CVE-2013-4361-xsa66.patch) + 52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch + 524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch + 524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch + 524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch + 524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch + 524e9dc0-xsm-forbid-PV-guest-console-reads.patch + 5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch + 5256be57-libxl-fix-vif-rate-parsing.patch + 5256be84-tools-ocaml-fix-erroneous-free-of-cpumap-in-stub_xc_vcpu_getaffinity.patch + 5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch + 5257a89a-x86-correct-LDT-checks.patch + 5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch + 5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch + 525b95f4-scheduler-adjust-internal-locking-interface.patch + 525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch + 525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch + 525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch + ------------------------------------------------------------------- Wed Oct 2 15:58:47 MDT 2013 - jfehlig@suse.com - Improvements to block-dmmd script bnc#828623 +------------------------------------------------------------------- +Tue Oct 1 15:28:25 MDT 2013 - carnold@suse.com + +- bnc#840196 - L3: MTU size on Dom0 gets reset when booting DomU + with e1000 device + set-mtu-from-bridge-for-tap-interface.patch + ------------------------------------------------------------------- Mon Sep 30 10:48:29 MDT 2013 - carnold@suse.com diff --git a/xen.spec b/xen.spec index ee881dc..b16b41b 100644 --- a/xen.spec +++ b/xen.spec @@ -139,7 +139,7 @@ BuildRequires: xorg-x11 BuildRequires: lndir %endif %endif -Version: 4.3.0_12 +Version: 4.3.0_14 Release: 0 PreReq: %insserv_prereq %fillup_prereq Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) @@ -254,9 +254,27 @@ Patch57: 523c1834-unmodified_drivers-enable-unplug-per-default.patch Patch58: 523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch Patch59: 523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch Patch60: 5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch -Patch6300: CVE-2013-4355-xsa63.patch -Patch6400: CVE-2013-4356-xsa64.patch -Patch6600: CVE-2013-4361-xsa66.patch +Patch61: 52496bea-x86-properly-handle-hvm_copy_from_guest_-phys-virt-errors.patch +Patch62: 52496c11-x86-mm-shadow-Fix-initialization-of-PV-shadow-L4-tables.patch +Patch63: 52496c32-x86-properly-set-up-fbld-emulation-operand-address.patch +Patch64: 52497c6c-x86-don-t-blindly-create-L3-tables-for-the-direct-map.patch +Patch65: 524e971b-x86-idle-Fix-get_cpu_idle_time-s-interaction-with-offline-pcpus.patch +Patch66: 
524e9762-x86-percpu-Force-INVALID_PERCPU_AREA-to-non-canonical.patch +Patch67: 524e983e-Nested-VMX-check-VMX-capability-before-read-VMX-related-MSRs.patch +Patch68: 524e98b1-Nested-VMX-fix-IA32_VMX_CR4_FIXED1-msr-emulation.patch +Patch69: 524e9dc0-xsm-forbid-PV-guest-console-reads.patch +Patch70: 5256a979-x86-check-segment-descriptor-read-result-in-64-bit-OUTS-emulation.patch +Patch71: 5256be57-libxl-fix-vif-rate-parsing.patch +Patch72: 5256be84-tools-ocaml-fix-erroneous-free-of-cpumap-in-stub_xc_vcpu_getaffinity.patch +Patch73: 5256be92-libxl-fix-out-of-memory-error-handling-in-libxl_list_cpupool.patch +Patch74: 5257a89a-x86-correct-LDT-checks.patch +Patch75: 5257a8e7-x86-add-address-validity-check-to-guest_map_l1e.patch +Patch76: 5257a944-x86-check-for-canonical-address-before-doing-page-walks.patch +Patch77: 525b95f4-scheduler-adjust-internal-locking-interface.patch +Patch78: 525b9617-sched-fix-race-between-sched_move_domain-and-vcpu_wake.patch +Patch79: 525e69e8-credit-unpause-parked-vcpu-before-destroying-it.patch +Patch80: 525faf5e-x86-print-relevant-tail-part-of-filename-for-warnings-and-crashes.patch +Patch7100: CVE-2013-4375-xsa71.patch # Upstream qemu patches # Our patches Patch301: xen-destdir.patch @@ -283,10 +301,12 @@ Patch350: hibernate.patch Patch351: stdvga-cache.patch Patch352: ipxe-enable-nics.patch Patch353: pygrub-netware-xnloader.patch +Patch354: pygrub-boot-legacy-sles.patch Patch360: blktapctrl-close-fifos.patch Patch361: blktapctrl-default-to-ioemu.patch Patch362: blktapctrl-disable-debug-printf.patch Patch363: blktap-pv-cdrom.patch +Patch364: set-mtu-from-bridge-for-tap-interface.patch # Hypervisor and PV driver Patches Patch501: x86-ioapic-ack-default.patch Patch502: x86-cpufreq-report.patch @@ -605,9 +625,27 @@ Authors %patch58 -p1 %patch59 -p1 %patch60 -p1 -%patch6300 -p1 -%patch6400 -p1 -%patch6600 -p1 +%patch61 -p1 +%patch62 -p1 +%patch63 -p1 +%patch64 -p1 +%patch65 -p1 +%patch66 -p1 +%patch67 -p1 +%patch68 -p1 +%patch69 -p1 +%patch70 -p1 +%patch71 -p1 +%patch72 -p1 +%patch73 -p1 +%patch74 -p1 +%patch75 -p1 +%patch76 -p1 +%patch77 -p1 +%patch78 -p1 +%patch79 -p1 +%patch80 -p1 +%patch7100 -p1 %patch301 -p1 %patch302 -p1 %patch303 -p1 @@ -630,10 +668,12 @@ Authors %patch351 -p1 %patch352 -p1 %patch353 -p1 +%patch354 -p1 %patch360 -p1 %patch361 -p1 %patch362 -p1 %patch363 -p1 +%patch364 -p1 %patch501 -p1 %patch502 -p1 %patch503 -p1