diff --git a/520119fc-xen-conring-Write-to-console-ring-even-if-console-lock-is-busted.patch b/520119fc-xen-conring-Write-to-console-ring-even-if-console-lock-is-busted.patch
new file mode 100644
index 0000000..d62b8bb
--- /dev/null
+++ b/520119fc-xen-conring-Write-to-console-ring-even-if-console-lock-is-busted.patch
@@ -0,0 +1,40 @@
+# Commit 66450c1d1ab3c4480bbba949113b95d1ab6a943a
+# Date 2013-08-06 17:45:00 +0200
+# Author Andrew Cooper
+# Committer Jan Beulich
+xen/conring: Write to console ring even if console lock is busted
+
+console_lock_busted gets set when an NMI/MCE/Double Fault handler decides to
+bring Xen down in an emergency. conring_puts() cannot block and does
+not have problematic interactions with the console_lock.
+
+Therefore, choosing to not put the string into the console ring simply means
+that the kexec environment can't find any panic() message caused by an IST
+interrupt, which is unhelpful for debugging purposes.
+
+In the case that two pcpus fight with console_force_unlock(), having slightly
+garbled strings in the console ring is far more useful than having nothing at
+all.
+
+Signed-off-by: Andrew Cooper
+Acked-by: Matt Wilson
+Acked-by: Keir Fraser
+
+--- a/xen/drivers/char/console.c
++++ b/xen/drivers/char/console.c
+@@ -463,12 +463,11 @@ static void __putstr(const char *str)
+     sercon_puts(str);
+     video_puts(str);
+ 
++    while ( (c = *str++) != '\0' )
++        putchar_console_ring(c);
++
+     if ( !console_locks_busted )
+-    {
+-        while ( (c = *str++) != '\0' )
+-            putchar_console_ring(c);
+         tasklet_schedule(&notify_dom0_con_ring_tasklet);
+-    }
+ }
+ 
+ static int printk_prefix_check(char *p, char **pp)
diff --git a/520a2705-watchdog-crash-Always-disable-watchdog-in-console_force_unlock.patch b/520a2705-watchdog-crash-Always-disable-watchdog-in-console_force_unlock.patch
new file mode 100644
index 0000000..3960102
--- /dev/null
+++ b/520a2705-watchdog-crash-Always-disable-watchdog-in-console_force_unlock.patch
@@ -0,0 +1,49 @@
+# Commit 7b9fa702ca323164d6b49e8b639a57f880454a8c
+# Date 2013-08-13 14:31:01 +0200
+# Author Andrew Cooper
+# Committer Jan Beulich
+watchdog/crash: Always disable watchdog in console_force_unlock()
+
+Depending on the state of the conring and serial_tx_buffer,
+console_force_unlock() can be a long running operation, usually because of
+serial_start_sync()
+
+XenServer testing has found a reliable case where console_force_unlock() on
+one PCPU takes long enough for another PCPU to timeout due to the watchdog
+(such as waiting for a tlb flush callin).
+
+The watchdog timeout causes the second PCPU to repeat the
+console_force_unlock(), at which point the first PCPU typically fails an
+assertion in spin_unlock_irqrestore(&port->tx_lock) (because the tx_lock has
+been unlocked behind itself).
+
+console_force_unlock() is only on emergency paths, so one way or another the
+host is going down. Disable the watchdog before forcing the console lock to
+help prevent having pcpus competing with each other to bring the host down.
+
+Signed-off-by: Andrew Cooper
+Acked-by: Keir Fraser
+
+--- a/xen/arch/x86/x86_64/traps.c
++++ b/xen/arch/x86/x86_64/traps.c
+@@ -226,8 +226,6 @@ void do_double_fault(struct cpu_user_reg
+     unsigned int cpu;
+     unsigned long crs[8];
+ 
+-    watchdog_disable();
+-
+     console_force_unlock();
+ 
+     asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+--- a/xen/drivers/char/console.c
++++ b/xen/drivers/char/console.c
+@@ -736,6 +736,9 @@ void console_end_log_everything(void)
+ 
+ void console_force_unlock(void)
+ {
++#ifdef CONFIG_X86
++    watchdog_disable();
++#endif
+     spin_lock_init(&console_lock);
+     serial_force_unlock(sercon_handle);
+     console_locks_busted = 1;
diff --git a/521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch b/521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch
index d0dd632..6259d1b 100644
--- a/521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch
+++ b/521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch
@@ -9,8 +9,8 @@ In particular, MMIO assignments should not be done using this area.
 Signed-off-by: Jan Beulich
 Acked-by Xiantao Zhang
 
---- 2013-08-30.orig/xen/arch/x86/domain_build.c	2013-07-09 20:57:12.000000000 +0200
-+++ 2013-08-30/xen/arch/x86/domain_build.c	2013-09-09 11:23:00.000000000 +0200
+--- a/xen/arch/x86/domain_build.c
++++ b/xen/arch/x86/domain_build.c
 @@ -1122,6 +1122,10 @@ int __init construct_dom0(
          if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
              rc |= iomem_deny_access(dom0, mfn, mfn);
diff --git a/521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch b/521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch
index cb8fdb4..c066b16 100644
--- a/521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch
+++ b/521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch
@@ -8,8 +8,8 @@ In particular, MMIO assignments should not be done using this area.
 
Signed-off-by: Jan Beulich ---- 2013-08-30.orig/xen/arch/x86/domain_build.c 2013-09-09 11:23:00.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/domain_build.c 2013-09-09 11:23:06.000000000 +0200 +--- a/xen/arch/x86/domain_build.c ++++ b/xen/arch/x86/domain_build.c @@ -1126,6 +1126,10 @@ int __init construct_dom0( rc |= iomem_deny_access(dom0, paddr_to_pfn(MSI_ADDR_BASE_LO), paddr_to_pfn(MSI_ADDR_BASE_LO + diff --git a/521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch b/521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch index b8cd049..966c5d5 100644 --- a/521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch +++ b/521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch @@ -9,8 +9,8 @@ x86/Intel: add support for Haswell CPU models Signed-off-by: Jan Beulich Acked-by: Keir Fraser ---- 2013-08-30.orig/xen/arch/x86/acpi/cpu_idle.c 2013-08-30 00:00:00.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/acpi/cpu_idle.c 2013-09-06 13:46:10.000000000 +0200 +--- a/xen/arch/x86/acpi/cpu_idle.c ++++ b/xen/arch/x86/acpi/cpu_idle.c @@ -135,8 +135,10 @@ static void do_get_hw_residencies(void * case 0x3A: case 0x3E: @@ -23,8 +23,8 @@ Acked-by: Keir Fraser GET_PC2_RES(hw_res->pc2); GET_CC7_RES(hw_res->cc7); /* fall through */ ---- 2013-08-30.orig/xen/arch/x86/hvm/vmx/vmx.c 2013-09-06 00:00:00.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/hvm/vmx/vmx.c 2013-09-06 13:46:10.000000000 +0200 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -1814,7 +1814,7 @@ static const struct lbr_info *last_branc /* Ivy Bridge */ case 58: case 62: @@ -34,8 +34,8 @@ Acked-by: Keir Fraser return nh_lbr; break; /* Atom */ ---- 2013-08-30.orig/xen/arch/x86/hvm/vmx/vpmu_core2.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/hvm/vmx/vpmu_core2.c 2013-09-06 13:46:10.000000000 +0200 +--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c ++++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c @@ -878,7 +878,12 @@ int vmx_vpmu_initialise(struct vcpu *v, case 0x3a: /* IvyBridge */ diff --git a/521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch b/521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch index 9b1dbe8..fdd9ba2 100644 --- a/521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch +++ b/521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch @@ -13,8 +13,8 @@ Moved resume_timer initialisation to ns16550_init_postirq, so it's only done onc Signed-off-by: Tomasz Wroblewski Acked-by: Keir Fraser ---- 2013-08-30.orig/xen/drivers/char/ns16550.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/xen/drivers/char/ns16550.c 2013-09-06 13:46:19.000000000 +0200 +--- a/xen/drivers/char/ns16550.c ++++ b/xen/drivers/char/ns16550.c @@ -128,6 +128,8 @@ static struct ns16550 { #define RESUME_DELAY MILLISECS(10) #define RESUME_RETRIES 100 diff --git a/521e1156-x86-AVX-instruction-emulation-fixes.patch b/521e1156-x86-AVX-instruction-emulation-fixes.patch index d26866e..4a51069 100644 --- a/521e1156-x86-AVX-instruction-emulation-fixes.patch +++ b/521e1156-x86-AVX-instruction-emulation-fixes.patch @@ -21,8 +21,8 @@ Also add respective test cases to the testing utility plus Signed-off-by: Jan Beulich Acked-by: Keir Fraser ---- 2013-08-30.orig/tools/tests/x86_emulator/test_x86_emulator.c 2012-09-18 23:42:06.000000000 +0200 -+++ 2013-08-30/tools/tests/x86_emulator/test_x86_emulator.c 2013-09-09 11:23:32.000000000 +0200 +--- a/tools/tests/x86_emulator/test_x86_emulator.c ++++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -94,13 +94,25 @@ static inline 
uint64_t xgetbv(uint32_t x } @@ -204,8 +204,8 @@ Acked-by: Keir Fraser for ( j = 1; j <= 2; j++ ) { #if defined(__i386__) ---- 2013-08-30.orig/xen/arch/x86/x86_emulate/x86_emulate.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/x86_emulate/x86_emulate.c 2013-09-09 11:23:33.000000000 +0200 +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -1454,10 +1454,10 @@ x86_emulate( /* VEX */ generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1); diff --git a/521ef8d9-AMD-IOMMU-add-missing-checks.patch b/521ef8d9-AMD-IOMMU-add-missing-checks.patch index 1b10c7e..950201a 100644 --- a/521ef8d9-AMD-IOMMU-add-missing-checks.patch +++ b/521ef8d9-AMD-IOMMU-add-missing-checks.patch @@ -11,8 +11,8 @@ Signed-off-by: Jan Beulich Reviewed-by: Andrew Cooper Acked-by: Suravee Suthikulpanit ---- 2013-08-30.orig/xen/drivers/passthrough/amd/iommu_acpi.c 2013-08-30 13:48:36.000000000 +0200 -+++ 2013-08-30/xen/drivers/passthrough/amd/iommu_acpi.c 2013-09-06 13:49:07.000000000 +0200 +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c @@ -674,6 +674,13 @@ static u16 __init parse_ivhd_device_spec if ( IO_APIC_ID(apic) != special->handle ) continue; diff --git a/52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch b/52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch index b970686..983c37a 100644 --- a/52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch +++ b/52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch @@ -14,8 +14,8 @@ written. Signed-off-by: Andrew Cooper Acked-by: Keir Fraser ---- 2013-08-30.orig/tools/firmware/hvmloader/smbios.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/tools/firmware/hvmloader/smbios.c 2013-09-09 11:23:52.000000000 +0200 +--- a/tools/firmware/hvmloader/smbios.c ++++ b/tools/firmware/hvmloader/smbios.c @@ -192,7 +192,8 @@ write_smbios_tables(void *ep, void *star #define do_struct(fn) do { \ diff --git a/52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch b/52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch index 05f4a7d..6585c0c 100644 --- a/52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch +++ b/52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch @@ -30,8 +30,8 @@ feel it is justified as: Signed-off-by: Andrew Cooper Acked-by: Keir Fraser ---- 2013-08-30.orig/xen/include/public/hvm/hvm_xs_strings.h 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/xen/include/public/hvm/hvm_xs_strings.h 2013-09-09 11:23:57.000000000 +0200 +--- a/xen/include/public/hvm/hvm_xs_strings.h ++++ b/xen/include/public/hvm/hvm_xs_strings.h @@ -75,6 +75,6 @@ /* 1 to 99 OEM strings can be set in xenstore using values of the form * below. These strings will be loaded into the SMBIOS type 11 structure. 
diff --git a/52205e27-x86-xsave-initialization-improvements.patch b/52205e27-x86-xsave-initialization-improvements.patch index f65a16f..1384556 100644 --- a/52205e27-x86-xsave-initialization-improvements.patch +++ b/52205e27-x86-xsave-initialization-improvements.patch @@ -1,3 +1,5 @@ +References: bnc#833796 + # Commit c6066e78f4a66005b0d5d86c6ade32e2ab78923a # Date 2013-08-30 10:56:07 +0200 # Author Jan Beulich @@ -12,8 +14,8 @@ x86/xsave: initialization improvements Signed-off-by: Jan Beulich Acked-by: Keir Fraser ---- 2013-08-30.orig/xen/arch/x86/cpu/common.c 2013-08-30 00:00:00.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/cpu/common.c 2013-09-09 11:24:05.000000000 +0200 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c @@ -304,7 +304,7 @@ void __cpuinit identify_cpu(struct cpuin clear_bit(X86_FEATURE_XSAVE, boot_cpu_data.x86_capability); @@ -23,8 +25,8 @@ Acked-by: Keir Fraser /* * The vendor-specific functions might have changed features. Now ---- 2013-08-30.orig/xen/arch/x86/xstate.c 2013-09-09 11:21:56.000000000 +0200 -+++ 2013-08-30/xen/arch/x86/xstate.c 2013-09-09 11:24:05.000000000 +0200 +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c @@ -247,11 +247,10 @@ void xstate_free_save_area(struct vcpu * } @@ -91,8 +93,8 @@ Acked-by: Keir Fraser } int handle_xsetbv(u32 index, u64 new_bv) ---- 2013-08-30.orig/xen/include/asm-x86/xstate.h 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/xen/include/asm-x86/xstate.h 2013-09-09 11:24:05.000000000 +0200 +--- a/xen/include/asm-x86/xstate.h ++++ b/xen/include/asm-x86/xstate.h @@ -81,6 +81,6 @@ int __must_check handle_xsetbv(u32 index /* extended state init and cleanup functions */ void xstate_free_save_area(struct vcpu *v); diff --git a/5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch b/5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch index 795b865..78b3ca9 100644 --- a/5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch +++ b/5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch @@ -16,8 +16,8 @@ Reviewed-by: Matt Wilson [msw: adjusted commit message] Signed-off-by: Matt Wilson ---- 2013-08-30.orig/tools/python/xen/util/pci.py 2013-09-09 11:21:53.000000000 +0200 -+++ 2013-08-30/tools/python/xen/util/pci.py 2013-09-09 11:24:09.000000000 +0200 +--- a/tools/python/xen/util/pci.py ++++ b/tools/python/xen/util/pci.py @@ -521,8 +521,9 @@ def save_pci_conf_space(devs_string): pci_path = sysfs_mnt + SYSFS_PCI_DEVS_PATH + '/' + pci_str + \ SYSFS_PCI_DEV_CONFIG_PATH diff --git a/52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch b/52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch index acf1bbf..0f2f8e7 100644 --- a/52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch +++ b/52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch @@ -12,8 +12,8 @@ Reviewed-by: Matt Wilson [msw: adjusted commit message] Signed-off-by: Matt Wilson ---- 2013-08-30.orig/tools/python/xen/util/pci.py 2013-09-09 11:24:09.000000000 +0200 -+++ 2013-08-30/tools/python/xen/util/pci.py 2013-09-09 11:24:14.000000000 +0200 +--- a/tools/python/xen/util/pci.py ++++ b/tools/python/xen/util/pci.py @@ -969,18 +969,22 @@ class PciDevice: ttl = 480; # 3840 bytes, minimum 8 bytes per capability pos = 0x100 diff --git a/52285317-hvmloader-fix-SeaBIOS-interface.patch b/52285317-hvmloader-fix-SeaBIOS-interface.patch index 1f7e73b..9b97af2 100644 --- a/52285317-hvmloader-fix-SeaBIOS-interface.patch +++ 
b/52285317-hvmloader-fix-SeaBIOS-interface.patch @@ -17,8 +17,8 @@ into a function so that it would actually compile. Signed-off-by: Jan Beulich Acked-by: Ian Campbell ---- 2013-08-30.orig/tools/firmware/hvmloader/config-seabios.h 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/tools/firmware/hvmloader/config-seabios.h 2013-09-09 11:24:23.000000000 +0200 +--- a/tools/firmware/hvmloader/config-seabios.h ++++ b/tools/firmware/hvmloader/config-seabios.h @@ -3,8 +3,6 @@ #define BIOS_INFO_PHYSICAL_ADDRESS 0x00001000 @@ -28,8 +28,8 @@ Acked-by: Ian Campbell #endif /* __HVMLOADER_CONFIG_SEABIOS_H__ */ /* ---- 2013-08-30.orig/tools/firmware/hvmloader/hvmloader.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/tools/firmware/hvmloader/hvmloader.c 2013-09-09 11:24:23.000000000 +0200 +--- a/tools/firmware/hvmloader/hvmloader.c ++++ b/tools/firmware/hvmloader/hvmloader.c @@ -292,8 +292,12 @@ int main(void) if ( bios->bios_load ) bios->bios_load(bios); @@ -43,8 +43,8 @@ Acked-by: Ian Campbell if ( (hvm_info->nr_vcpus > 1) || hvm_info->apic_mode ) { ---- 2013-08-30.orig/tools/firmware/hvmloader/rombios.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/tools/firmware/hvmloader/rombios.c 2013-09-09 11:24:23.000000000 +0200 +--- a/tools/firmware/hvmloader/rombios.c ++++ b/tools/firmware/hvmloader/rombios.c @@ -127,6 +127,8 @@ static void rombios_load(const struct bi uint32_t bioshigh; struct rombios_info *info; @@ -63,8 +63,8 @@ Acked-by: Ian Campbell struct bios_config rombios_config = { .name = "ROMBIOS", ---- 2013-08-30.orig/tools/firmware/hvmloader/seabios.c 2013-07-09 20:57:12.000000000 +0200 -+++ 2013-08-30/tools/firmware/hvmloader/seabios.c 2013-09-09 11:24:23.000000000 +0200 +--- a/tools/firmware/hvmloader/seabios.c ++++ b/tools/firmware/hvmloader/seabios.c @@ -133,15 +133,13 @@ static void seabios_setup_e820(void) dump_e820_table(e820, info->e820_nr); } diff --git a/522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch b/522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch new file mode 100644 index 0000000..e4b079b --- /dev/null +++ b/522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch @@ -0,0 +1,195 @@ +References: bnc#833251, bnc#834751 + +# Commit a350f3f43bcfac9c1591e28d8e43c505fcb172a5 +# Date 2013-09-09 10:40:11 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/EFI: properly handle run time memory regions outside the 1:1 map + +Namely with PFN compression, MMIO ranges that the firmware may need +runtime access to can live in the holes that gets shrunk/eliminated by +PFN compression, and hence no mappings would result from simply +copying Xen's direct mapping table's L3 page table entries. Build +mappings for this "manually" in the EFI runtime call 1:1 page tables. + +Use the opportunity to also properly identify (via a forcibly undefined +manifest constant) all the disabled code regions associated with it not +being acceptable for us to call SetVirtualAddressMap(). + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/efi/boot.c ++++ b/xen/arch/x86/efi/boot.c +@@ -26,6 +26,9 @@ + #include + #include + ++/* Using SetVirtualAddressMap() is incompatible with kexec: */ ++#undef USE_SET_VIRTUAL_ADDRESS_MAP ++ + #define SHIM_LOCK_PROTOCOL_GUID \ + { 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} } + +@@ -1434,7 +1437,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY + + /* Adjust pointers into EFI. 
*/ + efi_ct = (void *)efi_ct + DIRECTMAP_VIRT_START; +-#if 0 /* Only needed when using virtual mode (see efi_init_memory()). */ ++#ifdef USE_SET_VIRTUAL_ADDRESS_MAP + efi_rs = (void *)efi_rs + DIRECTMAP_VIRT_START; + #endif + efi_memmap = (void *)efi_memmap + DIRECTMAP_VIRT_START; +@@ -1477,6 +1480,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY + for( ; ; ); /* not reached */ + } + ++#ifndef USE_SET_VIRTUAL_ADDRESS_MAP + static __init void copy_mapping(unsigned long mfn, unsigned long end, + bool_t (*is_valid)(unsigned long smfn, + unsigned long emfn)) +@@ -1520,6 +1524,7 @@ static bool_t __init rt_range_valid(unsi + { + return 1; + } ++#endif + + #define INVALID_VIRTUAL_ADDRESS (0xBAAADUL << \ + (EFI_PAGE_SHIFT + BITS_PER_LONG - 32)) +@@ -1527,6 +1532,13 @@ static bool_t __init rt_range_valid(unsi + void __init efi_init_memory(void) + { + unsigned int i; ++#ifndef USE_SET_VIRTUAL_ADDRESS_MAP ++ struct rt_extra { ++ struct rt_extra *next; ++ unsigned long smfn, emfn; ++ unsigned int prot; ++ } *extra, *extra_head = NULL; ++#endif + + printk(XENLOG_INFO "EFI memory map:\n"); + for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) +@@ -1573,6 +1585,8 @@ void __init efi_init_memory(void) + !(smfn & pfn_hole_mask) && + !((smfn ^ (emfn - 1)) & ~pfn_pdx_bottom_mask) ) + { ++ if ( (unsigned long)mfn_to_virt(emfn - 1) >= HYPERVISOR_VIRT_END ) ++ prot &= ~_PAGE_GLOBAL; + if ( map_pages_to_xen((unsigned long)mfn_to_virt(smfn), + smfn, emfn - smfn, prot) == 0 ) + desc->VirtualStart = +@@ -1581,15 +1595,29 @@ void __init efi_init_memory(void) + printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n", + smfn, emfn - 1); + } ++#ifndef USE_SET_VIRTUAL_ADDRESS_MAP ++ else if ( !((desc->PhysicalStart + len - 1) >> (VADDR_BITS - 1)) && ++ (extra = xmalloc(struct rt_extra)) != NULL ) ++ { ++ extra->smfn = smfn; ++ extra->emfn = emfn; ++ extra->prot = prot & ~_PAGE_GLOBAL; ++ extra->next = extra_head; ++ extra_head = extra; ++ desc->VirtualStart = desc->PhysicalStart; ++ } ++#endif + else + { ++#ifdef USE_SET_VIRTUAL_ADDRESS_MAP + /* XXX allocate e.g. down from FIXADDR_START */ ++#endif + printk(XENLOG_ERR "No mapping for MFNs %#lx-%#lx\n", + smfn, emfn - 1); + } + } + +-#if 0 /* Incompatible with kexec. */ ++#ifdef USE_SET_VIRTUAL_ADDRESS_MAP + efi_rs->SetVirtualAddressMap(efi_memmap_size, efi_mdesc_size, + mdesc_ver, efi_memmap); + #else +@@ -1600,20 +1628,74 @@ void __init efi_init_memory(void) + + copy_mapping(0, max_page, ram_range_valid); + +- /* Insert non-RAM runtime mappings. */ ++ /* Insert non-RAM runtime mappings inside the direct map. */ + for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) + { + const EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i; + +- if ( desc->Attribute & EFI_MEMORY_RUNTIME ) ++ if ( (desc->Attribute & EFI_MEMORY_RUNTIME) && ++ desc->VirtualStart != INVALID_VIRTUAL_ADDRESS && ++ desc->VirtualStart != desc->PhysicalStart ) ++ copy_mapping(PFN_DOWN(desc->PhysicalStart), ++ PFN_UP(desc->PhysicalStart + ++ (desc->NumberOfPages << EFI_PAGE_SHIFT)), ++ rt_range_valid); ++ } ++ ++ /* Insert non-RAM runtime mappings outside of the direct map. 
*/ ++ while ( (extra = extra_head) != NULL ) ++ { ++ unsigned long addr = extra->smfn << PAGE_SHIFT; ++ l4_pgentry_t l4e = efi_l4_pgtable[l4_table_offset(addr)]; ++ l3_pgentry_t *pl3e; ++ l2_pgentry_t *pl2e; ++ l1_pgentry_t *l1t; ++ ++ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) + { +- if ( desc->VirtualStart != INVALID_VIRTUAL_ADDRESS ) +- copy_mapping(PFN_DOWN(desc->PhysicalStart), +- PFN_UP(desc->PhysicalStart + +- (desc->NumberOfPages << EFI_PAGE_SHIFT)), +- rt_range_valid); +- else +- /* XXX */; ++ pl3e = alloc_xen_pagetable(); ++ BUG_ON(!pl3e); ++ clear_page(pl3e); ++ efi_l4_pgtable[l4_table_offset(addr)] = ++ l4e_from_paddr(virt_to_maddr(pl3e), __PAGE_HYPERVISOR); ++ } ++ else ++ pl3e = l4e_to_l3e(l4e); ++ pl3e += l3_table_offset(addr); ++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) ++ { ++ pl2e = alloc_xen_pagetable(); ++ BUG_ON(!pl2e); ++ clear_page(pl2e); ++ *pl3e = l3e_from_paddr(virt_to_maddr(pl2e), __PAGE_HYPERVISOR); ++ } ++ else ++ { ++ BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE); ++ pl2e = l3e_to_l2e(*pl3e); ++ } ++ pl2e += l2_table_offset(addr); ++ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) ++ { ++ l1t = alloc_xen_pagetable(); ++ BUG_ON(!l1t); ++ clear_page(l1t); ++ *pl2e = l2e_from_paddr(virt_to_maddr(l1t), __PAGE_HYPERVISOR); ++ } ++ else ++ { ++ BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE); ++ l1t = l2e_to_l1e(*pl2e); ++ } ++ for ( i = l1_table_offset(addr); ++ i < L1_PAGETABLE_ENTRIES && extra->smfn < extra->emfn; ++ ++i, ++extra->smfn ) ++ l1t[i] = l1e_from_pfn(extra->smfn, extra->prot); ++ ++ if ( extra->smfn == extra->emfn ) ++ { ++ extra_head = extra->next; ++ xfree(extra); + } + } + diff --git a/522d8a1f-x86-allow-guest-to-set-clear-MSI-X-mask-bit-try-2.patch b/522d8a1f-x86-allow-guest-to-set-clear-MSI-X-mask-bit-try-2.patch new file mode 100644 index 0000000..940b87c --- /dev/null +++ b/522d8a1f-x86-allow-guest-to-set-clear-MSI-X-mask-bit-try-2.patch @@ -0,0 +1,145 @@ +# Commit a35137373aa9042424565e5ee76dc0a3bb7642ae +# Date 2013-09-09 10:43:11 +0200 +# Author Joby Poriyath +# Committer Jan Beulich +x86: allow guest to set/clear MSI-X mask bit (try 2) + +Guest needs the ability to enable and disable MSI-X interrupts +by setting the MSI-X control bit, for a passed-through device. +Guest is allowed to write MSI-X mask bit only if Xen *thinks* +that mask is clear (interrupts enabled). If the mask is set by +Xen (interrupts disabled), writes to mask bit by the guest is +ignored. + +Currently, a write to MSI-X mask bit by the guest is silently +ignored. + +A likely scenario is where we have a 82599 SR-IOV nic passed +through to a guest. From the guest if you do + + ifconfig down + ifconfig up + +the interrupts remain masked. On VF reset, the mask bit is set +by the controller. At this point, Xen is not aware that mask is set. +However, interrupts are enabled by VF driver by clearing the mask +bit by writing directly to BAR3 region containing the MSI-X table. + +From dom0, we can verify that +interrupts are being masked using 'xl debug-keys M'. + +Initially, guest was allowed to modify MSI-X bit. +Later this behaviour was changed. +See changeset 74c213c506afcd74a8556dd092995fd4dc38b225. 
+ +Signed-off-by: Joby Poriyath + +--- a/xen/arch/x86/hvm/vmsi.c ++++ b/xen/arch/x86/hvm/vmsi.c +@@ -187,6 +187,19 @@ static struct msixtbl_entry *msixtbl_fin + return NULL; + } + ++static struct msi_desc *virt_to_msi_desc(struct pci_dev *dev, void *virt) ++{ ++ struct msi_desc *desc; ++ ++ list_for_each_entry( desc, &dev->msi_list, list ) ++ if ( desc->msi_attrib.type == PCI_CAP_ID_MSIX && ++ virt >= desc->mask_base && ++ virt < desc->mask_base + PCI_MSIX_ENTRY_SIZE ) ++ return desc; ++ ++ return NULL; ++} ++ + static void __iomem *msixtbl_addr_to_virt( + struct msixtbl_entry *entry, unsigned long addr) + { +@@ -247,13 +260,16 @@ out: + } + + static int msixtbl_write(struct vcpu *v, unsigned long address, +- unsigned long len, unsigned long val) ++ unsigned long len, unsigned long val) + { + unsigned long offset; + struct msixtbl_entry *entry; ++ const struct msi_desc *msi_desc; + void *virt; + unsigned int nr_entry, index; + int r = X86EMUL_UNHANDLEABLE; ++ unsigned long flags, orig; ++ struct irq_desc *desc; + + if ( len != 4 || (address & 3) ) + return r; +@@ -283,22 +299,57 @@ static int msixtbl_write(struct vcpu *v, + if ( !virt ) + goto out; + +- /* Do not allow the mask bit to be changed. */ +-#if 0 /* XXX +- * As the mask bit is the only defined bit in the word, and as the +- * host MSI-X code doesn't preserve the other bits anyway, doing +- * this is pointless. So for now just discard the write (also +- * saving us from having to determine the matching irq_desc). +- */ ++ msi_desc = virt_to_msi_desc(entry->pdev, virt); ++ if ( !msi_desc || msi_desc->irq < 0 ) ++ goto out; ++ ++ desc = irq_to_desc(msi_desc->irq); ++ if ( !desc ) ++ goto out; ++ + spin_lock_irqsave(&desc->lock, flags); ++ ++ if ( !desc->msi_desc ) ++ goto unlock; ++ ++ ASSERT(msi_desc == desc->msi_desc); ++ + orig = readl(virt); +- val &= ~PCI_MSIX_VECTOR_BITMASK; +- val |= orig & PCI_MSIX_VECTOR_BITMASK; ++ ++ /* ++ * Do not allow guest to modify MSI-X control bit if it is masked ++ * by Xen. We'll only handle the case where Xen thinks that ++ * bit is unmasked, but hardware has silently masked the bit ++ * (in case of SR-IOV VF reset, etc). On the other hand, if Xen ++ * thinks that the bit is masked, but it's really not, ++ * we log a warning. ++ */ ++ if ( msi_desc->msi_attrib.masked ) ++ { ++ if ( !(orig & PCI_MSIX_VECTOR_BITMASK) ) ++ printk(XENLOG_WARNING "MSI-X control bit is unmasked when" ++ " it is expected to be masked [%04x:%02x:%02x.%u]\n", ++ entry->pdev->seg, entry->pdev->bus, ++ PCI_SLOT(entry->pdev->devfn), ++ PCI_FUNC(entry->pdev->devfn)); ++ ++ goto unlock; ++ } ++ ++ /* ++ * The mask bit is the only defined bit in the word. But we ++ * ought to preserve the reserved bits. Clearing the reserved ++ * bits can result in undefined behaviour (see PCI Local Bus ++ * Specification revision 2.3). 
++ */ ++ val &= PCI_MSIX_VECTOR_BITMASK; ++ val |= (orig & ~PCI_MSIX_VECTOR_BITMASK); + writel(val, virt); +- spin_unlock_irqrestore(&desc->lock, flags); +-#endif + ++unlock: ++ spin_unlock_irqrestore(&desc->lock, flags); + r = X86EMUL_OKAY; ++ + out: + rcu_read_unlock(&msixtbl_rcu_lock); + return r; diff --git a/522dc044-xmalloc-make-whole-pages-xfree-clear-the-order-field-ab-used-by-xmalloc.patch b/522dc044-xmalloc-make-whole-pages-xfree-clear-the-order-field-ab-used-by-xmalloc.patch new file mode 100644 index 0000000..93950c8 --- /dev/null +++ b/522dc044-xmalloc-make-whole-pages-xfree-clear-the-order-field-ab-used-by-xmalloc.patch @@ -0,0 +1,27 @@ +# Commit 0fbf3208d9c1a568aeeb61d9f4fbca03b1cfa1f8 +# Date 2013-09-09 14:34:12 +0200 +# Author Jan Beulich +# Committer Jan Beulich +xmalloc: make whole pages xfree() clear the order field (ab)used by xmalloc() + +Not doing this was found to cause problems with sequences of allocation +(multi-page), freeing, and then again allocation of the same page upon +boot when interrupts are still disabled (causing the owner field to be +non-zero, thus making the allocator attempt a TLB flush and, in its +processing, triggering an assertion). + +Reported-by: Tomasz Wroblewski +Signed-off-by: Jan Beulich +Tested-by: Tomasz Wroblewski +Acked-by: Keir Fraser + +--- a/xen/common/xmalloc_tlsf.c ++++ b/xen/common/xmalloc_tlsf.c +@@ -629,6 +629,7 @@ void xfree(void *p) + unsigned int i, order = get_order_from_pages(size); + + BUG_ON((unsigned long)p & ((PAGE_SIZE << order) - 1)); ++ PFN_ORDER(virt_to_page(p)) = 0; + for ( i = 0; ; ++i ) + { + if ( !(size & (1 << i)) ) diff --git a/522dc0e6-x86-xsave-fix-migration-from-xsave-capable-to-xsave-incapable-host.patch b/522dc0e6-x86-xsave-fix-migration-from-xsave-capable-to-xsave-incapable-host.patch new file mode 100644 index 0000000..ace1ed8 --- /dev/null +++ b/522dc0e6-x86-xsave-fix-migration-from-xsave-capable-to-xsave-incapable-host.patch @@ -0,0 +1,629 @@ +References: bnc#833796 + +# Commit 4cc1344447a0458df5d222960f2adf1b65084fa8 +# Date 2013-09-09 14:36:54 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/xsave: fix migration from xsave-capable to xsave-incapable host + +With CPUID features suitably masked this is supposed to work, but was +completely broken (i.e. the case wasn't even considered when the +original xsave save/restore code was written). + +First of all, xsave_enabled() wrongly returned the value of +cpu_has_xsave, i.e. not even taking into consideration attributes of +the vCPU in question. Instead this function ought to check whether the +guest ever enabled xsave support (by writing a [non-zero] value to +XCR0). As a result of this, a vCPU's xcr0 and xcr0_accum must no longer +be initialized to XSTATE_FP_SSE (since that's a valid value a guest +could write to XCR0), and the xsave/xrstor as well as the context +switch code need to suitably account for this (by always enforcing at +least this part of the state to be saved/loaded). + +This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add +strictly sanity check for XSAVE/XRSTOR") - we need to cleanly +distinguish between hardware capabilities and vCPU used features. + +Next both HVM and PV save code needed tweaking to not always save the +full state supported by the underlying hardware, but just the parts +that the guest actually used. 
Similarly the restore code should bail +not just on state being restored that the hardware cannot handle, but +also on inconsistent save state (inconsistent XCR0 settings or size of +saved state not in line with XCR0). + +And finally the PV extended context get/set code needs to use slightly +different logic than the HVM one, as here we can't just key off of +xsave_enabled() (i.e. avoid doing anything if a guest doesn't use +xsave) because the tools use this function to determine host +capabilities as well as read/write vCPU state. The set operation in +particular needs to be capable of cleanly dealing with input that +consists of only the xcr0 and xcr0_accum values (if they're both zero +then no further data is required). + +While for things to work correctly both sides (saving _and_ restoring +host) need to run with the fixed code, afaict no breakage should occur +if either side isn't up to date (other than the breakage that this +patch attempts to fix). + +Signed-off-by: Jan Beulich +Reviewed-by: Yang Zhang +Acked-by: Keir Fraser + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s + hv_cr4_mask &= ~X86_CR4_DE; + if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) ) + hv_cr4_mask &= ~X86_CR4_FSGSBASE; +- if ( xsave_enabled(v) ) ++ if ( cpu_has_xsave ) + hv_cr4_mask &= ~X86_CR4_OSXSAVE; + + if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) +@@ -1347,9 +1347,13 @@ static void __context_switch(void) + if ( !is_idle_vcpu(n) ) + { + memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES); +- if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() && +- !set_xcr0(n->arch.xcr0) ) +- BUG(); ++ if ( cpu_has_xsave ) ++ { ++ u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; ++ ++ if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) ++ BUG(); ++ } + vcpu_restore_fpu_eager(n); + n->arch.ctxt_switch_to(n); + } +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -1047,11 +1047,8 @@ long arch_do_domctl( + struct xen_domctl_vcpuextstate *evc; + struct vcpu *v; + uint32_t offset = 0; +- uint64_t _xfeature_mask = 0; +- uint64_t _xcr0, _xcr0_accum; +- void *receive_buf = NULL, *_xsave_area; + +-#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size) ++#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0)) + + evc = &domctl->u.vcpuextstate; + +@@ -1062,15 +1059,16 @@ long arch_do_domctl( + + if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) + { ++ unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum); ++ + if ( !evc->size && !evc->xfeature_mask ) + { + evc->xfeature_mask = xfeature_mask; +- evc->size = PV_XSAVE_SIZE; ++ evc->size = size; + ret = 0; + goto vcpuextstate_out; + } +- if ( evc->size != PV_XSAVE_SIZE || +- evc->xfeature_mask != xfeature_mask ) ++ if ( evc->size != size || evc->xfeature_mask != xfeature_mask ) + { + ret = -EINVAL; + goto vcpuextstate_out; +@@ -1093,7 +1091,7 @@ long arch_do_domctl( + offset += sizeof(v->arch.xcr0_accum); + if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, + offset, (void *)v->arch.xsave_area, +- xsave_cntxt_size) ) ++ size - 2 * sizeof(uint64_t)) ) + { + ret = -EFAULT; + goto vcpuextstate_out; +@@ -1101,13 +1099,14 @@ long arch_do_domctl( + } + else + { +- ret = -EINVAL; ++ void *receive_buf; ++ uint64_t _xcr0, _xcr0_accum; ++ const struct xsave_struct *_xsave_area; + +- _xfeature_mask = evc->xfeature_mask; +- /* xsave context must be restored on compatible target CPUs */ +- if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask ) +- goto 
vcpuextstate_out; +- if ( evc->size > PV_XSAVE_SIZE || evc->size < 2 * sizeof(uint64_t) ) ++ ret = -EINVAL; ++ if ( evc->size < 2 * sizeof(uint64_t) || ++ evc->size > 2 * sizeof(uint64_t) + ++ xstate_ctxt_size(xfeature_mask) ) + goto vcpuextstate_out; + + receive_buf = xmalloc_bytes(evc->size); +@@ -1128,20 +1127,30 @@ long arch_do_domctl( + _xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t)); + _xsave_area = receive_buf + 2 * sizeof(uint64_t); + +- if ( !(_xcr0 & XSTATE_FP) || _xcr0 & ~xfeature_mask ) ++ if ( _xcr0_accum ) + { +- xfree(receive_buf); +- goto vcpuextstate_out; ++ if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE ) ++ ret = validate_xstate(_xcr0, _xcr0_accum, ++ _xsave_area->xsave_hdr.xstate_bv, ++ evc->xfeature_mask); + } +- if ( (_xcr0 & _xcr0_accum) != _xcr0 ) ++ else if ( !_xcr0 ) ++ ret = 0; ++ if ( ret ) + { + xfree(receive_buf); + goto vcpuextstate_out; + } + +- v->arch.xcr0 = _xcr0; +- v->arch.xcr0_accum = _xcr0_accum; +- memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * sizeof(uint64_t) ); ++ if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) ) ++ { ++ v->arch.xcr0 = _xcr0; ++ v->arch.xcr0_accum = _xcr0_accum; ++ memcpy(v->arch.xsave_area, _xsave_area, ++ evc->size - 2 * sizeof(uint64_t)); ++ } ++ else ++ ret = -EINVAL; + + xfree(receive_buf); + } +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma + hvm_set_segment_register(v, x86_seg_ldtr, &seg); + + /* In case xsave-absent save file is restored on a xsave-capable host */ +- if ( xsave_enabled(v) ) ++ if ( cpu_has_xsave && !xsave_enabled(v) ) + { + struct xsave_struct *xsave_area = v->arch.xsave_area; + + memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); + xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; +- v->arch.xcr0_accum = XSTATE_FP_SSE; +- v->arch.xcr0 = XSTATE_FP_SSE; + } + else + memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); +@@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma + HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt, + 1, HVMSR_PER_VCPU); + +-#define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size) ++#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ ++ save_area) + \ ++ xstate_ctxt_size(xcr0)) + + static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + { +@@ -969,20 +969,20 @@ static int hvm_save_cpu_xsave_states(str + + for_each_vcpu ( d, v ) + { ++ unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); ++ + if ( !xsave_enabled(v) ) + continue; +- if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, HVM_CPU_XSAVE_SIZE) ) ++ if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) + return 1; + ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; +- h->cur += HVM_CPU_XSAVE_SIZE; +- memset(ctxt, 0, HVM_CPU_XSAVE_SIZE); ++ h->cur += size; + + ctxt->xfeature_mask = xfeature_mask; + ctxt->xcr0 = v->arch.xcr0; + ctxt->xcr0_accum = v->arch.xcr0_accum; +- if ( v->fpu_initialised ) +- memcpy(&ctxt->save_area, +- v->arch.xsave_area, xsave_cntxt_size); ++ memcpy(&ctxt->save_area, v->arch.xsave_area, ++ size - offsetof(struct hvm_hw_cpu_xsave, save_area)); + } + + return 0; +@@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str + + static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + { +- int vcpuid; ++ unsigned int vcpuid, size; ++ int err; + struct vcpu *v; + struct hvm_hw_cpu_xsave *ctxt; + struct hvm_save_descriptor *desc; +- uint64_t _xfeature_mask; 
+ + /* Which vcpu is this? */ + vcpuid = hvm_load_instance(h); +@@ -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str + } + + /* Fails since we can't restore an img saved on xsave-capable host. */ +- if ( !xsave_enabled(v) ) +- return -EINVAL; ++ if ( !cpu_has_xsave ) ++ return -EOPNOTSUPP; + + /* Customized checking for entry since our entry is of variable length */ + desc = (struct hvm_save_descriptor *)&h->data[h->cur]; + if ( sizeof (*desc) > h->size - h->cur) + { + printk(XENLOG_G_WARNING +- "HVM%d restore: not enough data left to read descriptor" +- "for type %u\n", d->domain_id, CPU_XSAVE_CODE); +- return -1; ++ "HVM%d.%d restore: not enough data left to read xsave descriptor\n", ++ d->domain_id, vcpuid); ++ return -ENODATA; + } + if ( desc->length + sizeof (*desc) > h->size - h->cur) + { + printk(XENLOG_G_WARNING +- "HVM%d restore: not enough data left to read %u bytes " +- "for type %u\n", d->domain_id, desc->length, CPU_XSAVE_CODE); +- return -1; ++ "HVM%d.%d restore: not enough data left to read %u xsave bytes\n", ++ d->domain_id, vcpuid, desc->length); ++ return -ENODATA; ++ } ++ if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + ++ XSTATE_AREA_MIN_SIZE ) ++ { ++ printk(XENLOG_G_WARNING ++ "HVM%d.%d restore mismatch: xsave length %u < %zu\n", ++ d->domain_id, vcpuid, desc->length, ++ offsetof(struct hvm_hw_cpu_xsave, ++ save_area) + XSTATE_AREA_MIN_SIZE); ++ return -EINVAL; + } +- if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > HVM_CPU_XSAVE_SIZE) ) ++ size = HVM_CPU_XSAVE_SIZE(xfeature_mask); ++ if ( desc->length > size ) + { + printk(XENLOG_G_WARNING +- "HVM%d restore mismatch: expected type %u with max length %u, " +- "saw type %u length %u\n", d->domain_id, CPU_XSAVE_CODE, +- (unsigned int)HVM_CPU_XSAVE_SIZE, +- desc->typecode, desc->length); +- return -1; ++ "HVM%d.%d restore mismatch: xsave length %u > %u\n", ++ d->domain_id, vcpuid, desc->length, size); ++ return -EOPNOTSUPP; + } + h->cur += sizeof (*desc); +- /* Checking finished */ + + ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; + h->cur += desc->length; + +- _xfeature_mask = ctxt->xfeature_mask; +- if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask ) +- return -EINVAL; ++ err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, ++ ctxt->save_area.xsave_hdr.xstate_bv, ++ ctxt->xfeature_mask); ++ if ( err ) ++ { ++ printk(XENLOG_G_WARNING ++ "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64 ++ " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", ++ d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, ++ ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); ++ return err; ++ } ++ size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); ++ if ( desc->length > size ) ++ { ++ printk(XENLOG_G_WARNING ++ "HVM%d.%d restore mismatch: xsave length %u > %u\n", ++ d->domain_id, vcpuid, desc->length, size); ++ return -EOPNOTSUPP; ++ } ++ /* Checking finished */ + + v->arch.xcr0 = ctxt->xcr0; + v->arch.xcr0_accum = ctxt->xcr0_accum; +- memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size); ++ memcpy(v->arch.xsave_area, &ctxt->save_area, ++ desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area)); + + return 0; + } +@@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA + "CPU_XSAVE", + hvm_save_cpu_xsave_states, + hvm_load_cpu_xsave_states, +- HVM_CPU_XSAVE_SIZE + sizeof (struct hvm_save_descriptor), ++ HVM_CPU_XSAVE_SIZE(xfeature_mask) + ++ sizeof(struct hvm_save_descriptor), + HVMSR_PER_VCPU); + return 0; + } +@@ -2767,7 +2795,7 @@ void 
hvm_cpuid(unsigned int input, unsig + __clear_bit(X86_FEATURE_APIC & 31, edx); + + /* Fix up OSXSAVE. */ +- if ( xsave_enabled(v) ) ++ if ( cpu_has_xsave ) + *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ? + cpufeat_mask(X86_FEATURE_OSXSAVE) : 0; + +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -947,8 +947,7 @@ static int construct_vmcs(struct vcpu *v + /* Host control registers. */ + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); +- __vmwrite(HOST_CR4, +- mmu_cr4_features | (xsave_enabled(v) ? X86_CR4_OSXSAVE : 0)); ++ __vmwrite(HOST_CR4, mmu_cr4_features); + + /* Host CS:RIP. */ + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); +--- a/xen/arch/x86/i387.c ++++ b/xen/arch/x86/i387.c +@@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp + { + bool_t ok; + ++ ASSERT(v->arch.xsave_area); + /* + * XCR0 normally represents what guest OS set. In case of Xen itself, +- * we set all supported feature mask before doing save/restore. ++ * we set the accumulated feature mask before doing save/restore. + */ +- ok = set_xcr0(v->arch.xcr0_accum); ++ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); + ASSERT(ok); + xrstor(v, mask); +- ok = set_xcr0(v->arch.xcr0); ++ ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); + ASSERT(ok); + } + +@@ -124,13 +125,15 @@ static inline void fpu_xsave(struct vcpu + { + bool_t ok; + +- /* XCR0 normally represents what guest OS set. In case of Xen itself, +- * we set all accumulated feature mask before doing save/restore. ++ ASSERT(v->arch.xsave_area); ++ /* ++ * XCR0 normally represents what guest OS set. In case of Xen itself, ++ * we set the accumulated feature mask before doing save/restore. + */ +- ok = set_xcr0(v->arch.xcr0_accum); ++ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); + ASSERT(ok); + xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY); +- ok = set_xcr0(v->arch.xcr0); ++ ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); + ASSERT(ok); + } + +@@ -238,7 +241,7 @@ void vcpu_restore_fpu_lazy(struct vcpu * + if ( v->fpu_dirtied ) + return; + +- if ( xsave_enabled(v) ) ++ if ( cpu_has_xsave ) + fpu_xrstor(v, XSTATE_LAZY); + else if ( v->fpu_initialised ) + { +@@ -268,7 +271,7 @@ void vcpu_save_fpu(struct vcpu *v) + /* This can happen, if a paravirtualised guest OS has set its CR0.TS. */ + clts(); + +- if ( xsave_enabled(v) ) ++ if ( cpu_has_xsave ) + fpu_xsave(v); + else if ( cpu_has_fxsr ) + fpu_fxsave(v); +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -816,7 +816,7 @@ static void pv_cpuid(struct cpu_user_reg + __clear_bit(X86_FEATURE_PDCM % 32, &c); + __clear_bit(X86_FEATURE_PCID % 32, &c); + __clear_bit(X86_FEATURE_DCA % 32, &c); +- if ( !xsave_enabled(current) ) ++ if ( !cpu_has_xsave ) + { + __clear_bit(X86_FEATURE_XSAVE % 32, &c); + __clear_bit(X86_FEATURE_AVX % 32, &c); +@@ -841,7 +841,7 @@ static void pv_cpuid(struct cpu_user_reg + break; + + case 0x0000000d: /* XSAVE */ +- if ( !xsave_enabled(current) ) ++ if ( !cpu_has_xsave ) + goto unsupported; + break; + +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c +@@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt; + * the supported and enabled features on the processor, including the + * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known. + */ +-u32 xsave_cntxt_size; ++static u32 __read_mostly xsave_cntxt_size; + + /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. 
*/ + u64 xfeature_mask; +@@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas + + bool_t xsave_enabled(const struct vcpu *v) + { +- if ( cpu_has_xsave ) +- { +- ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); +- ASSERT(v->arch.xsave_area); +- } ++ if ( !cpu_has_xsave ) ++ return 0; + +- return cpu_has_xsave; ++ ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); ++ ASSERT(v->arch.xsave_area); ++ ++ return !!v->arch.xcr0_accum; + } + + int xstate_alloc_save_area(struct vcpu *v) +@@ -234,8 +234,8 @@ int xstate_alloc_save_area(struct vcpu * + save_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; + + v->arch.xsave_area = save_area; +- v->arch.xcr0 = XSTATE_FP_SSE; +- v->arch.xcr0_accum = XSTATE_FP_SSE; ++ v->arch.xcr0 = 0; ++ v->arch.xcr0_accum = 0; + + return 0; + } +@@ -253,7 +253,11 @@ void xstate_init(bool_t bsp) + u64 feature_mask; + + if ( boot_cpu_data.cpuid_level < XSTATE_CPUID ) ++ { ++ BUG_ON(!bsp); ++ setup_clear_cpu_cap(X86_FEATURE_XSAVE); + return; ++ } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + +@@ -273,7 +277,6 @@ void xstate_init(bool_t bsp) + set_in_cr4(X86_CR4_OSXSAVE); + if ( !set_xcr0(feature_mask) ) + BUG(); +- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if ( bsp ) + { +@@ -282,14 +285,14 @@ void xstate_init(bool_t bsp) + * xsave_cntxt_size is the max size required by enabled features. + * We know FP/SSE and YMM about eax, and nothing about edx at present. + */ +- xsave_cntxt_size = ebx; ++ xsave_cntxt_size = xstate_ctxt_size(feature_mask); + printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n", + __func__, xsave_cntxt_size, xfeature_mask); + } + else + { + BUG_ON(xfeature_mask != feature_mask); +- BUG_ON(xsave_cntxt_size != ebx); ++ BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask)); + } + + /* Check XSAVEOPT feature. */ +@@ -300,6 +303,42 @@ void xstate_init(bool_t bsp) + BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT)); + } + ++unsigned int xstate_ctxt_size(u64 xcr0) ++{ ++ u32 ebx = 0; ++ ++ if ( xcr0 ) ++ { ++ u64 act_xcr0 = get_xcr0(); ++ u32 eax, ecx, edx; ++ bool_t ok = set_xcr0(xcr0); ++ ++ ASSERT(ok); ++ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); ++ ASSERT(ebx <= ecx); ++ ok = set_xcr0(act_xcr0); ++ ASSERT(ok); ++ } ++ ++ return ebx; ++} ++ ++int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask) ++{ ++ if ( (xcr0_accum & ~xfeat_mask) || ++ (xstate_bv & ~xcr0_accum) || ++ (xcr0 & ~xcr0_accum) || ++ !(xcr0 & XSTATE_FP) || ++ ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) || ++ ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) ) ++ return -EINVAL; ++ ++ if ( xcr0_accum & ~xfeature_mask ) ++ return -EOPNOTSUPP; ++ ++ return 0; ++} ++ + int handle_xsetbv(u32 index, u64 new_bv) + { + struct vcpu *curr = current; +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s + #define pv_guest_cr4_to_real_cr4(v) \ + (((v)->arch.pv_vcpu.ctrlreg[4] \ + | (mmu_cr4_features \ +- & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP)) \ +- | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0) \ +- | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0)) \ ++ & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ ++ X86_CR4_OSXSAVE)) \ ++ | ((v)->domain->arch.vtsc ? 
X86_CR4_TSD : 0)) \ + & ~X86_CR4_DE) + #define real_cr4_to_pv_guest_cr4(c) \ + ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD \ +--- a/xen/include/asm-x86/hvm/hvm.h ++++ b/xen/include/asm-x86/hvm/hvm.h +@@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru + ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\ + ? X86_CR4_VMXE : 0) | \ + (cpu_has_pcid ? X86_CR4_PCIDE : 0) | \ +- (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0)))) ++ (cpu_has_xsave ? X86_CR4_OSXSAVE : 0)))) + + /* These exceptions must always be intercepted. */ + #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op)) +--- a/xen/include/asm-x86/xstate.h ++++ b/xen/include/asm-x86/xstate.h +@@ -33,7 +33,6 @@ + #define XSTATE_NONLAZY (XSTATE_LWP) + #define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY) + +-extern unsigned int xsave_cntxt_size; + extern u64 xfeature_mask; + + /* extended state save area */ +@@ -76,11 +75,14 @@ uint64_t get_xcr0(void); + void xsave(struct vcpu *v, uint64_t mask); + void xrstor(struct vcpu *v, uint64_t mask); + bool_t xsave_enabled(const struct vcpu *v); ++int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, ++ u64 xfeat_mask); + int __must_check handle_xsetbv(u32 index, u64 new_bv); + + /* extended state init and cleanup functions */ + void xstate_free_save_area(struct vcpu *v); + int xstate_alloc_save_area(struct vcpu *v); + void xstate_init(bool_t bsp); ++unsigned int xstate_ctxt_size(u64 xcr0); + + #endif /* __ASM_XSTATE_H */ diff --git a/522f2f9f-Nested-VMX-Clear-bit-31-of-IA32_VMX_BASIC-MSR.patch b/522f2f9f-Nested-VMX-Clear-bit-31-of-IA32_VMX_BASIC-MSR.patch new file mode 100644 index 0000000..e220677 --- /dev/null +++ b/522f2f9f-Nested-VMX-Clear-bit-31-of-IA32_VMX_BASIC-MSR.patch @@ -0,0 +1,25 @@ +# Commit f3a4eb9253826d1e49e682314c8666b28fa0b717 +# Date 2013-09-10 16:41:35 +0200 +# Author Yang Zhang +# Committer Jan Beulich +Nested VMX: Clear bit 31 of IA32_VMX_BASIC MSR + +The bit 31 of revision_id will set to 1 if vmcs shadowing enabled. And +according intel SDM, the bit 31 of IA32_VMX_BASIC MSR is always 0. So we +cannot set low 32 bit of IA32_VMX_BASIC to revision_id directly. Must clear +the bit 31 to 0. 
+ +Signed-off-by: Yang Zhang +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -1828,7 +1828,7 @@ int nvmx_msr_read_intercept(unsigned int + switch (msr) { + case MSR_IA32_VMX_BASIC: + data = (host_data & (~0ul << 32)) | +- ((v->arch.hvm_vmx.vmcs)->vmcs_revision_id); ++ (v->arch.hvm_vmx.vmcs->vmcs_revision_id & 0x7fffffff); + break; + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: diff --git a/522f37b2-sched-arinc653-check-for-guest-data-transfer-failures.patch b/522f37b2-sched-arinc653-check-for-guest-data-transfer-failures.patch new file mode 100644 index 0000000..053cf63 --- /dev/null +++ b/522f37b2-sched-arinc653-check-for-guest-data-transfer-failures.patch @@ -0,0 +1,41 @@ +# Commit 546ba2f17008387cf9821df46e6dac04f0883a9b +# Date 2013-09-10 17:16:02 +0200 +# Author Matthew Daley +# Committer Jan Beulich +sched/arinc653: check for guest data transfer failures + +Coverity-ID: 1055121 +Coverity-ID: 1055122 +Coverity-ID: 1055123 +Coverity-ID: 1055124 +Signed-off-by: Matthew Daley +Reviewed-by: Andrew Cooper +Acked-by: George Dunlap +Acked-by: Keir Fraser + +--- a/xen/common/sched_arinc653.c ++++ b/xen/common/sched_arinc653.c +@@ -635,12 +635,21 @@ a653sched_adjust_global(const struct sch + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: +- copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1); ++ if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) ++ { ++ rc = -EFAULT; ++ break; ++ } ++ + rc = arinc653_sched_set(ops, &local_sched); + break; + case XEN_SYSCTL_SCHEDOP_getinfo: + rc = arinc653_sched_get(ops, &local_sched); +- copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1); ++ if ( rc ) ++ break; ++ ++ if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) ++ rc = -EFAULT; + break; + } + diff --git a/523172d5-x86-fix-memory-cut-off-when-using-PFN-compression.patch b/523172d5-x86-fix-memory-cut-off-when-using-PFN-compression.patch new file mode 100644 index 0000000..a938210 --- /dev/null +++ b/523172d5-x86-fix-memory-cut-off-when-using-PFN-compression.patch @@ -0,0 +1,116 @@ +References: bnc#839600 + +# Commit 8efce9d69998a3d3c720ac7dbdb9b7e240369957 +# Date 2013-09-12 09:52:53 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: fix memory cut-off when using PFN compression + +For one setup_max_pdx(), when invoked a second time (after SRAT got +parsed), needs to start from the original max_page value again (using +the already adjusted one from the first invocation would not allow the +cut-off boundary to be moved up). + +Second, _if_ we need to cut off some part of memory, we must not allow +this to also propagate into the NUMA accounting. Otherwise +cutoff_node() results in nodes_cover_memory() to find some parts of +memory apparently not having a PXM association, causing all SRAT info +to be ignored. + +The only possibly problematic consumer of node_spanned_pages (the +meaning of which gets altered here in that it now also includes memory +Xen can't actively make use of) is XEN_SYSCTL_numainfo: At a first +glance the potentially larger reported memory size shouldn't confuse +tool stacks. + +And finally we must not put our boot time modules at addresses which +(at that time) can't be guaranteed to be accessible later. This applies +to both the EFI boot loader and the module relocation code. 
+ +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser +Acked-by: Dario Faggioli + +--- a/xen/arch/x86/efi/boot.c ++++ b/xen/arch/x86/efi/boot.c +@@ -459,7 +459,8 @@ static bool_t __init read_file(EFI_FILE_ + what = what ?: L"Seek"; + else + { +- file->addr = (EFI_PHYSICAL_ADDRESS)1 << (32 + PAGE_SHIFT); ++ file->addr = min(1UL << (32 + PAGE_SHIFT), ++ HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START); + ret = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData, + PFN_UP(size), &file->addr); + } +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -377,9 +377,9 @@ static uint64_t __init consider_modules( + return e; + } + +-static void __init setup_max_pdx(void) ++static void __init setup_max_pdx(unsigned long top_page) + { +- max_pdx = pfn_to_pdx(max_page - 1) + 1; ++ max_pdx = pfn_to_pdx(top_page - 1) + 1; + + if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) ) + max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT; +@@ -547,7 +547,7 @@ void __init __start_xen(unsigned long mb + unsigned int initrdidx; + multiboot_info_t *mbi = __va(mbi_p); + module_t *mod = (module_t *)__va(mbi->mods_addr); +- unsigned long nr_pages, modules_headroom, *module_map; ++ unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; + int i, j, e820_warn = 0, bytes = 0; + bool_t acpi_boot_table_init_done = 0; + struct ns16550_defaults ns16550 = { +@@ -751,7 +751,7 @@ void __init __start_xen(unsigned long mb + } + + /* Sanitise the raw E820 map to produce a final clean version. */ +- max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr); ++ max_page = raw_max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr); + + /* Create a temporary copy of the E820 map. */ + memcpy(&boot_e820, &e820, sizeof(e820)); +@@ -820,7 +820,10 @@ void __init __start_xen(unsigned long mb + (end - s) >> PAGE_SHIFT, PAGE_HYPERVISOR); + } + +- e = min_t(uint64_t, e, 1ULL << (PAGE_SHIFT + 32)); ++ if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START, ++ 1UL << (PAGE_SHIFT + 32)) ) ++ e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START, ++ 1UL << (PAGE_SHIFT + 32)); + #define reloc_size ((__pa(&_end) + mask) & ~mask) + /* Is the region suitable for relocating Xen? */ + if ( !xen_phys_start && e <= limit ) +@@ -969,7 +972,7 @@ void __init __start_xen(unsigned long mb + /* Late kexec reservation (dynamic start address). 
*/ + kexec_reserve_area(&boot_e820); + +- setup_max_pdx(); ++ setup_max_pdx(raw_max_page); + if ( highmem_start ) + xenheap_max_mfn(PFN_DOWN(highmem_start)); + +@@ -995,7 +998,7 @@ void __init __start_xen(unsigned long mb + { + acpi_boot_table_init_done = 1; + srat_parse_regions(s); +- setup_max_pdx(); ++ setup_max_pdx(raw_max_page); + } + + if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx ) +@@ -1133,7 +1136,7 @@ void __init __start_xen(unsigned long mb + + acpi_numa_init(); + +- numa_initmem_init(0, max_page); ++ numa_initmem_init(0, raw_max_page); + + end_boot_allocator(); + system_state = SYS_STATE_boot; diff --git a/5231e090-libxc-x86-fix-page-table-creation-for-huge-guests.patch b/5231e090-libxc-x86-fix-page-table-creation-for-huge-guests.patch new file mode 100644 index 0000000..42037ad --- /dev/null +++ b/5231e090-libxc-x86-fix-page-table-creation-for-huge-guests.patch @@ -0,0 +1,94 @@ +# Commit 06d086832155fc7f5344e9d108b979de34674d11 +# Date 2013-09-12 17:41:04 +0200 +# Author Jan Beulich +# Committer Jan Beulich +libxc/x86: fix page table creation for huge guests + +The switch-over logic from one page directory to the next was wrong; +it needs to be deferred until we actually reach the last page within +a given region, instead of being done when the last entry of a page +directory gets started with. + +Signed-off-by: Jan Beulich +Tested-by: Konrad Rzeszutek Wilk +Acked-by: Ian Jackson + +--- a/tools/libxc/xc_dom_x86.c ++++ b/tools/libxc/xc_dom_x86.c +@@ -251,7 +251,7 @@ static int setup_pgtables_x86_32_pae(str + l3_pgentry_64_t *l3tab; + l2_pgentry_64_t *l2tab = NULL; + l1_pgentry_64_t *l1tab = NULL; +- unsigned long l3off, l2off, l1off; ++ unsigned long l3off, l2off = 0, l1off; + xen_vaddr_t addr; + xen_pfn_t pgpfn; + xen_pfn_t l3mfn = xc_dom_p2m_guest(dom, l3pfn); +@@ -299,8 +299,6 @@ static int setup_pgtables_x86_32_pae(str + l2off = l2_table_offset_pae(addr); + l2tab[l2off] = + pfn_to_paddr(xc_dom_p2m_guest(dom, l1pfn)) | L2_PROT; +- if ( l2off == (L2_PAGETABLE_ENTRIES_PAE - 1) ) +- l2tab = NULL; + l1pfn++; + } + +@@ -312,8 +310,13 @@ static int setup_pgtables_x86_32_pae(str + if ( (addr >= dom->pgtables_seg.vstart) && + (addr < dom->pgtables_seg.vend) ) + l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */ ++ + if ( l1off == (L1_PAGETABLE_ENTRIES_PAE - 1) ) ++ { + l1tab = NULL; ++ if ( l2off == (L2_PAGETABLE_ENTRIES_PAE - 1) ) ++ l2tab = NULL; ++ } + } + + if ( dom->virt_pgtab_end <= 0xc0000000 ) +@@ -360,7 +363,7 @@ static int setup_pgtables_x86_64(struct + l3_pgentry_64_t *l3tab = NULL; + l2_pgentry_64_t *l2tab = NULL; + l1_pgentry_64_t *l1tab = NULL; +- uint64_t l4off, l3off, l2off, l1off; ++ uint64_t l4off, l3off = 0, l2off = 0, l1off; + uint64_t addr; + xen_pfn_t pgpfn; + +@@ -391,8 +394,6 @@ static int setup_pgtables_x86_64(struct + l3off = l3_table_offset_x86_64(addr); + l3tab[l3off] = + pfn_to_paddr(xc_dom_p2m_guest(dom, l2pfn)) | L3_PROT; +- if ( l3off == (L3_PAGETABLE_ENTRIES_X86_64 - 1) ) +- l3tab = NULL; + l2pfn++; + } + +@@ -405,8 +406,6 @@ static int setup_pgtables_x86_64(struct + l2off = l2_table_offset_x86_64(addr); + l2tab[l2off] = + pfn_to_paddr(xc_dom_p2m_guest(dom, l1pfn)) | L2_PROT; +- if ( l2off == (L2_PAGETABLE_ENTRIES_X86_64 - 1) ) +- l2tab = NULL; + l1pfn++; + } + +@@ -418,8 +417,17 @@ static int setup_pgtables_x86_64(struct + if ( (addr >= dom->pgtables_seg.vstart) && + (addr < dom->pgtables_seg.vend) ) + l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */ ++ + if ( l1off == (L1_PAGETABLE_ENTRIES_X86_64 - 1) ) ++ { + l1tab = NULL; ++ if ( l2off 
== (L2_PAGETABLE_ENTRIES_X86_64 - 1) ) ++ { ++ l2tab = NULL; ++ if ( l3off == (L3_PAGETABLE_ENTRIES_X86_64 - 1) ) ++ l3tab = NULL; ++ } ++ } + } + return 0; + diff --git a/5231f00c-cpufreq-missing-check-of-copy_from_guest.patch b/5231f00c-cpufreq-missing-check-of-copy_from_guest.patch new file mode 100644 index 0000000..5ab3a0d --- /dev/null +++ b/5231f00c-cpufreq-missing-check-of-copy_from_guest.patch @@ -0,0 +1,30 @@ +# Commit 803f9a6cdfeda64beee908576de0ad02d6b0c480 +# Date 2013-09-12 17:47:08 +0100 +# Author Tim Deegan +# Committer Tim Deegan +cpufreq: missing check of copy_from_guest() + +Coverity CID 1055131 +Coverity CID 1055132 + +Signed-off-by: Tim Deegan +Reviewed-by: Andrew Cooper +Acked-by: Jan Beulich + +--- a/xen/drivers/cpufreq/cpufreq.c ++++ b/xen/drivers/cpufreq/cpufreq.c +@@ -471,8 +471,12 @@ int set_px_pminfo(uint32_t acpi_id, stru + ret = -ENOMEM; + goto out; + } +- copy_from_guest(pxpt->states, dom0_px_info->states, +- dom0_px_info->state_count); ++ if ( copy_from_guest(pxpt->states, dom0_px_info->states, ++ dom0_px_info->state_count) ) ++ { ++ ret = -EFAULT; ++ goto out; ++ } + pxpt->state_count = dom0_px_info->state_count; + + if ( cpufreq_verbose ) diff --git a/523304b6-x86-machine_restart-must-not-call-acpi_dmar_reinstate-twice.patch b/523304b6-x86-machine_restart-must-not-call-acpi_dmar_reinstate-twice.patch new file mode 100644 index 0000000..5485a8f --- /dev/null +++ b/523304b6-x86-machine_restart-must-not-call-acpi_dmar_reinstate-twice.patch @@ -0,0 +1,40 @@ +# Commit a54dc5f4fe1eae6b1beb21326ef0338cd3969cd1 +# Date 2013-09-13 14:27:34 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: machine_restart() must not call acpi_dmar_reinstate() twice + +.. as that function is not idempotent (it always alters the table +checksum). The (generally) duplicate call was a result from it being +made before machine_restart() re-invoking itself on the boot CPU. + +Considering that no problem arose so far from the table corruption I +doubt that we need to restore the correct table signature on the +reboot path in general. The only case I can see this as potentially +necessary is the tboot one, hence do the call just in that case. + +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser + +--- a/xen/arch/x86/shutdown.c ++++ b/xen/arch/x86/shutdown.c +@@ -115,8 +115,6 @@ void machine_restart(unsigned int delay_ + console_start_sync(); + spin_debug_disable(); + +- acpi_dmar_reinstate(); +- + local_irq_enable(); + + /* Ensure we are the boot CPU. */ +@@ -141,7 +139,10 @@ void machine_restart(unsigned int delay_ + mdelay(delay_millisecs); + + if ( tboot_in_measured_env() ) ++ { ++ acpi_dmar_reinstate(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); ++ } + + efi_reset_system(reboot_mode != 0); + diff --git a/5239a064-x86-HVM-fix-failure-path-in-hvm_vcpu_initialise.patch b/5239a064-x86-HVM-fix-failure-path-in-hvm_vcpu_initialise.patch new file mode 100644 index 0000000..7a523e4 --- /dev/null +++ b/5239a064-x86-HVM-fix-failure-path-in-hvm_vcpu_initialise.patch @@ -0,0 +1,29 @@ +# Commit 925fbcb7fdd6238f26b1576dc1f3e297f1f24f1e +# Date 2013-09-18 14:45:24 +0200 +# Author George Dunlap +# Committer Jan Beulich +x86/HVM: fix failure path in hvm_vcpu_initialise + +It looks like one of the failure cases in hvm_vcpu_initialise jumps to +the wrong label; this could lead to slow leaks if something isn't +cleaned up properly. + +I will probably change these labels in a future patch, but I figured +it was better to have this fix separately. + +This is also a candidate for backport. 
+ +Signed-off-by: George Dunlap +Signed-off-by: Mukesh Rathor + +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1125,7 +1125,7 @@ int hvm_vcpu_initialise(struct vcpu *v) + /* Create bufioreq event channel. */ + rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); + if ( rc < 0 ) +- goto fail2; ++ goto fail4; + d->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_EVTCHN] = rc; + } + diff --git a/5239a076-VMX-fix-failure-path-in-construct_vmcs.patch b/5239a076-VMX-fix-failure-path-in-construct_vmcs.patch new file mode 100644 index 0000000..111cbbb --- /dev/null +++ b/5239a076-VMX-fix-failure-path-in-construct_vmcs.patch @@ -0,0 +1,26 @@ +# Commit dad7e45bf44c0569546a3ed7d0fa4182a4a73f0a +# Date 2013-09-18 14:45:42 +0200 +# Author George Dunlap +# Committer Jan Beulich +VMX: fix failure path in construct_vmcs + +If the allocation fails, make sure to call vmx_vmcs_exit(). + +This is a candidate for backport. + +Signed-off-by: George Dunlap +Signed-off-by: Mukesh Rathor + +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -894,7 +894,10 @@ static int construct_vmcs(struct vcpu *v + unsigned long *msr_bitmap = alloc_xenheap_page(); + + if ( msr_bitmap == NULL ) ++ { ++ vmx_vmcs_exit(v); + return -ENOMEM; ++ } + + memset(msr_bitmap, ~0, PAGE_SIZE); + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; diff --git a/523c0ed4-x86-HVM-properly-handle-wide-MMIO.patch b/523c0ed4-x86-HVM-properly-handle-wide-MMIO.patch new file mode 100644 index 0000000..dd1464f --- /dev/null +++ b/523c0ed4-x86-HVM-properly-handle-wide-MMIO.patch @@ -0,0 +1,184 @@ +# Commit 3b89f08a498ddac09d4002d9849e329018ceb107 +# Date 2013-09-20 11:01:08 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/HVM: properly handle MMIO reads and writes wider than a machine word + +Just like real hardware we ought to split such accesses transparently +to the caller. With little extra effort we can at once even handle page +crossing accesses correctly. + +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser + +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -438,6 +438,7 @@ static int __hvmemul_read( + { + struct vcpu *curr = current; + unsigned long addr, reps = 1; ++ unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER); + uint32_t pfec = PFEC_page_present; + struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; + paddr_t gpa; +@@ -447,16 +448,38 @@ static int __hvmemul_read( + seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr); + if ( rc != X86EMUL_OKAY ) + return rc; ++ off = addr & (PAGE_SIZE - 1); ++ /* ++ * We only need to handle sizes actual instruction operands can have. All ++ * such sizes are either powers of 2 or the sum of two powers of 2. Thus ++ * picking as initial chunk size the largest power of 2 not greater than ++ * the total size will always result in only power-of-2 size requests ++ * issued to hvmemul_do_mmio() (hvmemul_do_io() rejects non-powers-of-2). 
++ */ ++ while ( chunk & (chunk - 1) ) ++ chunk &= chunk - 1; ++ if ( off + bytes > PAGE_SIZE ) ++ while ( off & (chunk - 1) ) ++ chunk >>= 1; + + if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva ) + { +- unsigned int off = addr & (PAGE_SIZE - 1); + if ( access_type == hvm_access_insn_fetch ) + return X86EMUL_UNHANDLEABLE; + gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off); +- if ( (off + bytes) <= PAGE_SIZE ) +- return hvmemul_do_mmio(gpa, &reps, bytes, 0, +- IOREQ_READ, 0, p_data); ++ while ( (off + chunk) <= PAGE_SIZE ) ++ { ++ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data); ++ if ( rc != X86EMUL_OKAY || bytes == chunk ) ++ return rc; ++ addr += chunk; ++ off += chunk; ++ gpa += chunk; ++ p_data += chunk; ++ bytes -= chunk; ++ if ( bytes < chunk ) ++ chunk = bytes; ++ } + } + + if ( (seg != x86_seg_none) && +@@ -473,14 +496,32 @@ static int __hvmemul_read( + return X86EMUL_EXCEPTION; + case HVMCOPY_unhandleable: + return X86EMUL_UNHANDLEABLE; +- case HVMCOPY_bad_gfn_to_mfn: ++ case HVMCOPY_bad_gfn_to_mfn: + if ( access_type == hvm_access_insn_fetch ) + return X86EMUL_UNHANDLEABLE; +- rc = hvmemul_linear_to_phys( +- addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt); +- if ( rc != X86EMUL_OKAY ) +- return rc; +- return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data); ++ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, ++ hvmemul_ctxt); ++ while ( rc == X86EMUL_OKAY ) ++ { ++ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data); ++ if ( rc != X86EMUL_OKAY || bytes == chunk ) ++ break; ++ addr += chunk; ++ off += chunk; ++ p_data += chunk; ++ bytes -= chunk; ++ if ( bytes < chunk ) ++ chunk = bytes; ++ if ( off < PAGE_SIZE ) ++ gpa += chunk; ++ else ++ { ++ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, ++ hvmemul_ctxt); ++ off = 0; ++ } ++ } ++ return rc; + case HVMCOPY_gfn_paged_out: + return X86EMUL_RETRY; + case HVMCOPY_gfn_shared: +@@ -537,6 +578,7 @@ static int hvmemul_write( + container_of(ctxt, struct hvm_emulate_ctxt, ctxt); + struct vcpu *curr = current; + unsigned long addr, reps = 1; ++ unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER); + uint32_t pfec = PFEC_page_present | PFEC_write_access; + struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; + paddr_t gpa; +@@ -546,14 +588,30 @@ static int hvmemul_write( + seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr); + if ( rc != X86EMUL_OKAY ) + return rc; ++ off = addr & (PAGE_SIZE - 1); ++ /* See the respective comment in __hvmemul_read(). 
*/ ++ while ( chunk & (chunk - 1) ) ++ chunk &= chunk - 1; ++ if ( off + bytes > PAGE_SIZE ) ++ while ( off & (chunk - 1) ) ++ chunk >>= 1; + + if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva ) + { +- unsigned int off = addr & (PAGE_SIZE - 1); + gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off); +- if ( (off + bytes) <= PAGE_SIZE ) +- return hvmemul_do_mmio(gpa, &reps, bytes, 0, +- IOREQ_WRITE, 0, p_data); ++ while ( (off + chunk) <= PAGE_SIZE ) ++ { ++ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data); ++ if ( rc != X86EMUL_OKAY || bytes == chunk ) ++ return rc; ++ addr += chunk; ++ off += chunk; ++ gpa += chunk; ++ p_data += chunk; ++ bytes -= chunk; ++ if ( bytes < chunk ) ++ chunk = bytes; ++ } + } + + if ( (seg != x86_seg_none) && +@@ -569,12 +627,29 @@ static int hvmemul_write( + case HVMCOPY_unhandleable: + return X86EMUL_UNHANDLEABLE; + case HVMCOPY_bad_gfn_to_mfn: +- rc = hvmemul_linear_to_phys( +- addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt); +- if ( rc != X86EMUL_OKAY ) +- return rc; +- return hvmemul_do_mmio(gpa, &reps, bytes, 0, +- IOREQ_WRITE, 0, p_data); ++ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, ++ hvmemul_ctxt); ++ while ( rc == X86EMUL_OKAY ) ++ { ++ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data); ++ if ( rc != X86EMUL_OKAY || bytes == chunk ) ++ break; ++ addr += chunk; ++ off += chunk; ++ p_data += chunk; ++ bytes -= chunk; ++ if ( bytes < chunk ) ++ chunk = bytes; ++ if ( off < PAGE_SIZE ) ++ gpa += chunk; ++ else ++ { ++ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, ++ hvmemul_ctxt); ++ off = 0; ++ } ++ } ++ return rc; + case HVMCOPY_gfn_paged_out: + return X86EMUL_RETRY; + case HVMCOPY_gfn_shared: diff --git a/523c1758-sched_credit-filter-node-affinity-mask-against-online-cpus.patch b/523c1758-sched_credit-filter-node-affinity-mask-against-online-cpus.patch new file mode 100644 index 0000000..03d7bc4 --- /dev/null +++ b/523c1758-sched_credit-filter-node-affinity-mask-against-online-cpus.patch @@ -0,0 +1,155 @@ +# Commit 5e5a44b6c942d6ea47f15d6f1ed02b03e0d69445 +# Date 2013-09-20 11:37:28 +0200 +# Author Dario Faggioli +# Committer Jan Beulich +sched_credit: filter node-affinity mask against online cpus + +in _csched_cpu_pick(), as not doing so may result in the domain's +node-affinity mask (as retrieved by csched_balance_cpumask() ) +and online mask (as retrieved by cpupool_scheduler_cpumask() ) +having an empty intersection. + +Therefore, when attempting a node-affinity load balancing step +and running this: + + ... + /* Pick an online CPU from the proper affinity mask */ + csched_balance_cpumask(vc, balance_step, &cpus); + cpumask_and(&cpus, &cpus, online); + ... + +we end up with an empty cpumask (in cpus). At this point, in +the following code: + + .... + /* If present, prefer vc's current processor */ + cpu = cpumask_test_cpu(vc->processor, &cpus) + ? vc->processor + : cpumask_cycle(vc->processor, &cpus); + .... 
+ +an ASSERT (from inside cpumask_cycle() ) triggers like this: + +(XEN) Xen call trace: +(XEN) [] _csched_cpu_pick+0x1d2/0x652 +(XEN) [] csched_cpu_pick+0xe/0x10 +(XEN) [] vcpu_migrate+0x167/0x31e +(XEN) [] cpu_disable_scheduler+0x1c8/0x287 +(XEN) [] cpupool_unassign_cpu_helper+0x20/0xb4 +(XEN) [] continue_hypercall_tasklet_handler+0x4a/0xb1 +(XEN) [] do_tasklet_work+0x78/0xab +(XEN) [] do_tasklet+0x5f/0x8b +(XEN) [] idle_loop+0x57/0x5e +(XEN) +(XEN) +(XEN) **************************************** +(XEN) Panic on CPU 1: +(XEN) Assertion 'cpu < nr_cpu_ids' failed at /home/dario/Sources/xen/xen/xen.git/xen/include/xe:16481 + +It is for example sufficient to have a domain with node-affinity +to NUMA node 1 running, and issueing a `xl cpupool-numa-split' +would make the above happen. That is because, by default, all +the existing domains remain assigned to the first cpupool, and +it now (after the cpupool-numa-split) only includes NUMA node 0. + +This change prevents that by generalizing the function used +for figuring out whether a node-affinity load balancing step +is legit or not. This way we can, in _csched_cpu_pick(), +figure out early enough that the mask would end up empty, +skip the step all together and avoid the splat. + +Signed-off-by: Dario Faggioli +Reviewed-by: George Dunlap + +--- a/xen/common/sched_credit.c ++++ b/xen/common/sched_credit.c +@@ -296,15 +296,28 @@ static void csched_set_node_affinity( + * vcpu-affinity balancing is always necessary and must never be skipped. + * OTOH, if a domain's node-affinity is said to be automatically computed + * (or if it just spans all the nodes), we can safely avoid dealing with +- * node-affinity entirely. Ah, node-affinity is also deemed meaningless +- * in case it has empty intersection with the vcpu's vcpu-affinity, as it +- * would mean trying to schedule it on _no_ pcpu! ++ * node-affinity entirely. ++ * ++ * Node-affinity is also deemed meaningless in case it has empty ++ * intersection with mask, to cover the cases where using the node-affinity ++ * mask seems legit, but would instead led to trying to schedule the vcpu ++ * on _no_ pcpu! Typical use cases are for mask to be equal to the vcpu's ++ * vcpu-affinity, or to the && of vcpu-affinity and the set of online cpus ++ * in the domain's cpupool. + */ +-#define __vcpu_has_node_affinity(vc) \ +- ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ +- || !cpumask_intersects(vc->cpu_affinity, \ +- CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ +- || vc->domain->auto_node_affinity == 1) ) ++static inline int __vcpu_has_node_affinity(const struct vcpu *vc, ++ const cpumask_t *mask) ++{ ++ const struct domain *d = vc->domain; ++ const struct csched_dom *sdom = CSCHED_DOM(d); ++ ++ if ( d->auto_node_affinity ++ || cpumask_full(sdom->node_affinity_cpumask) ++ || !cpumask_intersects(sdom->node_affinity_cpumask, mask) ) ++ return 0; ++ ++ return 1; ++} + + /* + * Each csched-balance step uses its own cpumask. This function determines +@@ -393,7 +406,8 @@ __runq_tickle(unsigned int cpu, struct c + int new_idlers_empty; + + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY +- && !__vcpu_has_node_affinity(new->vcpu) ) ++ && !__vcpu_has_node_affinity(new->vcpu, ++ new->vcpu->cpu_affinity) ) + continue; + + /* Are there idlers suitable for new (for this balance step)? 
*/ +@@ -626,11 +640,32 @@ _csched_cpu_pick(const struct scheduler + int cpu = vc->processor; + int balance_step; + ++ /* Store in cpus the mask of online cpus on which the domain can run */ + online = cpupool_scheduler_cpumask(vc->domain->cpupool); ++ cpumask_and(&cpus, vc->cpu_affinity, online); ++ + for_each_csched_balance_step( balance_step ) + { ++ /* ++ * We want to pick up a pcpu among the ones that are online and ++ * can accommodate vc, which is basically what we computed above ++ * and stored in cpus. As far as vcpu-affinity is concerned, ++ * there always will be at least one of these pcpus, hence cpus ++ * is never empty and the calls to cpumask_cycle() and ++ * cpumask_test_cpu() below are ok. ++ * ++ * On the other hand, when considering node-affinity too, it ++ * is possible for the mask to become empty (for instance, if the ++ * domain has been put in a cpupool that does not contain any of the ++ * nodes in its node-affinity), which would result in the ASSERT()-s ++ * inside cpumask_*() operations triggering (in debug builds). ++ * ++ * Therefore, in this case, we filter the node-affinity mask against ++ * cpus and, if the result is empty, we just skip the node-affinity ++ * balancing step all together. ++ */ + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY +- && !__vcpu_has_node_affinity(vc) ) ++ && !__vcpu_has_node_affinity(vc, &cpus) ) + continue; + + /* Pick an online CPU from the proper affinity mask */ +@@ -1449,7 +1484,7 @@ csched_runq_steal(int peer_cpu, int cpu, + * or counter. + */ + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY +- && !__vcpu_has_node_affinity(vc) ) ++ && !__vcpu_has_node_affinity(vc, vc->cpu_affinity) ) + continue; + + csched_balance_cpumask(vc, balance_step, csched_balance_mask); diff --git a/disable_emulated_device.patch b/523c1834-unmodified_drivers-enable-unplug-per-default.patch similarity index 52% rename from disable_emulated_device.patch rename to 523c1834-unmodified_drivers-enable-unplug-per-default.patch index 3a5dc09..6016a7a 100644 --- a/disable_emulated_device.patch +++ b/523c1834-unmodified_drivers-enable-unplug-per-default.patch @@ -1,5 +1,8 @@ -From: Olaf Hering -Subject: [PATCH v2] unmodified_drivers: enable unplug per default +# Commit df17e9c889c48c9c10aa3f9dd0bb11077f54efc4 +# Date 2013-09-20 11:41:08 +0200 +# Author Olaf Hering +# Committer Jan Beulich +unmodified_drivers: enable unplug per default Since xen-3.3 an official unplug protocol for emulated hardware is available in the toolstack. The pvops kernel does the unplug per @@ -7,22 +10,19 @@ default, so it is safe to do it also in the drivers for forward ported xenlinux. Currently its required to load xen-platform-pci with the module parameter dev_unplug=all, which is cumbersome. +Also recognize the dev_unplug=never parameter, which provides the +default before this patch. 
Signed-off-by: Olaf Hering ---- - unmodified_drivers/linux-2.6/platform-pci/platform-pci.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) -Index: xen-4.3.0-testing/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c -=================================================================== ---- xen-4.3.0-testing.orig/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c -+++ xen-4.3.0-testing/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c +--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c ++++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c @@ -66,7 +66,7 @@ MODULE_LICENSE("GPL"); static char *dev_unplug; module_param(dev_unplug, charp, 0644); MODULE_PARM_DESC(dev_unplug, "Emulated devices to unplug: " - "[all,][ide-disks,][aux-ide-disks,][nics]\n"); -+ "[all,][ide-disks,][aux-ide-disks,][nics] (default is 'all')\n"); ++ "[all,][ide-disks,][aux-ide-disks,][nics,][never] (default is 'all')\n"); struct pci_dev *xen_platform_pdev; @@ -37,3 +37,12 @@ Index: xen-4.3.0-testing/unmodified_drivers/linux-2.6/platform-pci/platform-pci. for (p = dev_unplug; p; p = q) { q = strchr(dev_unplug, ','); if (q) +@@ -302,6 +306,8 @@ static int check_platform_magic(struct d + unplug |= UNPLUG_AUX_IDE_DISKS; + else if (!strcmp(p, "nics")) + unplug |= UNPLUG_ALL_NICS; ++ else if (!strcmp(p, "never")) ++ unplug = 0; + else + dev_warn(dev, "unrecognised option '%s' " + "in module parameter 'dev_unplug'\n", p); diff --git a/523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch b/523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch new file mode 100644 index 0000000..dd5fee5 --- /dev/null +++ b/523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch @@ -0,0 +1,92 @@ +# Commit 7f12732670b31b2fea899a4160d455574658474f +# Date 2013-09-23 09:53:55 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/HVM: linear address must be canonical for the whole accessed range + +... rather than just for the first byte. + +While at it, also +- make the real mode case at least dpo a wrap around check +- drop the mis-named "gpf" label (we're not generating faults here) + and use in-place returns instead + +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser + +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1938,8 +1938,7 @@ int hvm_virtual_to_linear_addr( + unsigned int addr_size, + unsigned long *linear_addr) + { +- unsigned long addr = offset; +- uint32_t last_byte; ++ unsigned long addr = offset, last_byte; + + if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ) + { +@@ -1948,6 +1947,9 @@ int hvm_virtual_to_linear_addr( + * Certain of them are not done in native real mode anyway. + */ + addr = (uint32_t)(addr + reg->base); ++ last_byte = (uint32_t)addr + bytes - 1; ++ if ( last_byte < addr ) ++ return 0; + } + else if ( addr_size != 64 ) + { +@@ -1959,17 +1961,17 @@ int hvm_virtual_to_linear_addr( + { + case hvm_access_read: + if ( (reg->attr.fields.type & 0xa) == 0x8 ) +- goto gpf; /* execute-only code segment */ ++ return 0; /* execute-only code segment */ + break; + case hvm_access_write: + if ( (reg->attr.fields.type & 0xa) != 0x2 ) +- goto gpf; /* not a writable data segment */ ++ return 0; /* not a writable data segment */ + break; + default: + break; + } + +- last_byte = offset + bytes - 1; ++ last_byte = (uint32_t)offset + bytes - 1; + + /* Is this a grows-down data segment? Special limit check if so. 
*/ + if ( (reg->attr.fields.type & 0xc) == 0x4 ) +@@ -1980,10 +1982,10 @@ int hvm_virtual_to_linear_addr( + + /* Check first byte and last byte against respective bounds. */ + if ( (offset <= reg->limit) || (last_byte < offset) ) +- goto gpf; ++ return 0; + } + else if ( (last_byte > reg->limit) || (last_byte < offset) ) +- goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */ ++ return 0; /* last byte is beyond limit or wraps 0xFFFFFFFF */ + + /* + * Hardware truncates to 32 bits in compatibility mode. +@@ -2000,15 +2002,14 @@ int hvm_virtual_to_linear_addr( + if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) ) + addr += reg->base; + +- if ( !is_canonical_address(addr) ) +- goto gpf; ++ last_byte = addr + bytes - 1; ++ if ( !is_canonical_address(addr) || last_byte < addr || ++ !is_canonical_address(last_byte) ) ++ return 0; + } + + *linear_addr = addr; + return 1; +- +- gpf: +- return 0; + } + + /* On non-NULL return, we leave this function holding an additional diff --git a/523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch b/523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch new file mode 100644 index 0000000..7c1eeca --- /dev/null +++ b/523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch @@ -0,0 +1,65 @@ +# Commit 14fcce2fa883405bab26b60821a6cc5f2c770833 +# Date 2013-09-23 09:55:14 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/HVM: refuse doing string operations in certain situations + +We shouldn't do any acceleration for +- "rep movs" when either side is passed through MMIO or when both sides + are handled by qemu +- "rep ins" and "rep outs" when the memory operand is any kind of MMIO + +Signed-off-by: Jan Beulich +Acked-by: Keir Fraser + +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -686,6 +686,7 @@ static int hvmemul_rep_ins( + unsigned long addr; + uint32_t pfec = PFEC_page_present | PFEC_write_access; + paddr_t gpa; ++ p2m_type_t p2mt; + int rc; + + rc = hvmemul_virtual_to_linear( +@@ -702,6 +703,10 @@ static int hvmemul_rep_ins( + if ( rc != X86EMUL_OKAY ) + return rc; + ++ (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt); ++ if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm ) ++ return X86EMUL_UNHANDLEABLE; ++ + return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); + } +@@ -719,6 +724,7 @@ static int hvmemul_rep_outs( + unsigned long addr; + uint32_t pfec = PFEC_page_present; + paddr_t gpa; ++ p2m_type_t p2mt; + int rc; + + rc = hvmemul_virtual_to_linear( +@@ -735,6 +741,10 @@ static int hvmemul_rep_outs( + if ( rc != X86EMUL_OKAY ) + return rc; + ++ (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt); ++ if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm ) ++ return X86EMUL_UNHANDLEABLE; ++ + return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); + } +@@ -787,6 +797,10 @@ static int hvmemul_rep_movs( + (void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT, &sp2mt); + (void) get_gfn_query_unlocked(current->domain, dgpa >> PAGE_SHIFT, &dp2mt); + ++ if ( sp2mt == p2m_mmio_direct || dp2mt == p2m_mmio_direct || ++ (sp2mt == p2m_mmio_dm && dp2mt == p2m_mmio_dm) ) ++ return X86EMUL_UNHANDLEABLE; ++ + if ( sp2mt == p2m_mmio_dm ) + return hvmemul_do_mmio( + sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL); diff --git 
a/5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch b/5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch new file mode 100644 index 0000000..045e433 --- /dev/null +++ b/5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch @@ -0,0 +1,52 @@ +References: bnc#839596 CVE-2013-1442 XSA-62 + +# Commit 63a75ba0de817d6f384f96d25427a05c313e2179 +# Date 2013-09-25 10:41:25 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/xsave: initialize extended register state when guests enable it + +Till now, when setting previously unset bits in XCR0 we wouldn't touch +the active register state, thus leaving in the newly enabled registers +whatever a prior user of it left there, i.e. potentially leaking +information between guests. + +This is CVE-2013-1442 / XSA-62. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c +@@ -342,6 +342,7 @@ int validate_xstate(u64 xcr0, u64 xcr0_a + int handle_xsetbv(u32 index, u64 new_bv) + { + struct vcpu *curr = current; ++ u64 mask; + + if ( index != XCR_XFEATURE_ENABLED_MASK ) + return -EOPNOTSUPP; +@@ -355,9 +356,23 @@ int handle_xsetbv(u32 index, u64 new_bv) + if ( !set_xcr0(new_bv) ) + return -EFAULT; + ++ mask = new_bv & ~curr->arch.xcr0_accum; + curr->arch.xcr0 = new_bv; + curr->arch.xcr0_accum |= new_bv; + ++ mask &= curr->fpu_dirtied ? ~XSTATE_FP_SSE : XSTATE_NONLAZY; ++ if ( mask ) ++ { ++ unsigned long cr0 = read_cr0(); ++ ++ clts(); ++ if ( curr->fpu_dirtied ) ++ asm ( "stmxcsr %0" : "=m" (curr->arch.xsave_area->fpu_sse.mxcsr) ); ++ xrstor(curr, mask); ++ if ( cr0 & X86_CR0_TS ) ++ write_cr0(cr0); ++ } ++ + return 0; + } + diff --git a/CVE-2013-4355-xsa63.patch b/CVE-2013-4355-xsa63.patch new file mode 100644 index 0000000..48b7c0c --- /dev/null +++ b/CVE-2013-4355-xsa63.patch @@ -0,0 +1,173 @@ +References: bnc#840592 CVE-2013-4355 XSA-63 + +x86: properly handle hvm_copy_from_guest_{phys,virt}() errors + +Ignoring them generally implies using uninitialized data and, in all +but two of the cases dealt with here, potentially leaking hypervisor +stack contents to guests. + +This is CVE-2013-4355 / XSA-63. + +Signed-off-by: Jan Beulich +Reviewed-by: Tim Deegan +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -2316,11 +2316,7 @@ void hvm_task_switch( + + rc = hvm_copy_from_guest_virt( + &tss, prev_tr.base, sizeof(tss), PFEC_page_present); +- if ( rc == HVMCOPY_bad_gva_to_gfn ) +- goto out; +- if ( rc == HVMCOPY_gfn_paged_out ) +- goto out; +- if ( rc == HVMCOPY_gfn_shared ) ++ if ( rc != HVMCOPY_okay ) + goto out; + + eflags = regs->eflags; +@@ -2365,13 +2361,11 @@ void hvm_task_switch( + + rc = hvm_copy_from_guest_virt( + &tss, tr.base, sizeof(tss), PFEC_page_present); +- if ( rc == HVMCOPY_bad_gva_to_gfn ) +- goto out; +- if ( rc == HVMCOPY_gfn_paged_out ) +- goto out; +- /* Note: this could be optimised, if the callee functions knew we want RO +- * access */ +- if ( rc == HVMCOPY_gfn_shared ) ++ /* ++ * Note: The HVMCOPY_gfn_shared case could be optimised, if the callee ++ * functions knew we want RO access. 
++ */ ++ if ( rc != HVMCOPY_okay ) + goto out; + + +--- a/xen/arch/x86/hvm/intercept.c ++++ b/xen/arch/x86/hvm/intercept.c +@@ -87,17 +87,28 @@ static int hvm_mmio_access(struct vcpu * + { + for ( i = 0; i < p->count; i++ ) + { +- int ret; +- +- ret = hvm_copy_from_guest_phys(&data, +- p->data + (sign * i * p->size), +- p->size); +- if ( (ret == HVMCOPY_gfn_paged_out) || +- (ret == HVMCOPY_gfn_shared) ) ++ switch ( hvm_copy_from_guest_phys(&data, ++ p->data + sign * i * p->size, ++ p->size) ) + { ++ case HVMCOPY_okay: ++ break; ++ case HVMCOPY_gfn_paged_out: ++ case HVMCOPY_gfn_shared: + rc = X86EMUL_RETRY; + break; ++ case HVMCOPY_bad_gfn_to_mfn: ++ data = ~0; ++ break; ++ case HVMCOPY_bad_gva_to_gfn: ++ ASSERT(0); ++ /* fall through */ ++ default: ++ rc = X86EMUL_UNHANDLEABLE; ++ break; + } ++ if ( rc != X86EMUL_OKAY ) ++ break; + rc = write_handler(v, p->addr + (sign * i * p->size), p->size, + data); + if ( rc != X86EMUL_OKAY ) +@@ -165,8 +176,28 @@ static int process_portio_intercept(port + for ( i = 0; i < p->count; i++ ) + { + data = 0; +- (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size, +- p->size); ++ switch ( hvm_copy_from_guest_phys(&data, ++ p->data + sign * i * p->size, ++ p->size) ) ++ { ++ case HVMCOPY_okay: ++ break; ++ case HVMCOPY_gfn_paged_out: ++ case HVMCOPY_gfn_shared: ++ rc = X86EMUL_RETRY; ++ break; ++ case HVMCOPY_bad_gfn_to_mfn: ++ data = ~0; ++ break; ++ case HVMCOPY_bad_gva_to_gfn: ++ ASSERT(0); ++ /* fall through */ ++ default: ++ rc = X86EMUL_UNHANDLEABLE; ++ break; ++ } ++ if ( rc != X86EMUL_OKAY ) ++ break; + rc = action(IOREQ_WRITE, p->addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; +--- a/xen/arch/x86/hvm/io.c ++++ b/xen/arch/x86/hvm/io.c +@@ -340,14 +340,24 @@ static int dpci_ioport_write(uint32_t mp + data = p->data; + if ( p->data_is_ptr ) + { +- int ret; +- +- ret = hvm_copy_from_guest_phys(&data, +- p->data + (sign * i * p->size), +- p->size); +- if ( (ret == HVMCOPY_gfn_paged_out) && +- (ret == HVMCOPY_gfn_shared) ) ++ switch ( hvm_copy_from_guest_phys(&data, ++ p->data + sign * i * p->size, ++ p->size) ) ++ { ++ case HVMCOPY_okay: ++ break; ++ case HVMCOPY_gfn_paged_out: ++ case HVMCOPY_gfn_shared: + return X86EMUL_RETRY; ++ case HVMCOPY_bad_gfn_to_mfn: ++ data = ~0; ++ break; ++ case HVMCOPY_bad_gva_to_gfn: ++ ASSERT(0); ++ /* fall through */ ++ default: ++ return X86EMUL_UNHANDLEABLE; ++ } + } + + switch ( p->size ) +--- a/xen/arch/x86/hvm/vmx/realmode.c ++++ b/xen/arch/x86/hvm/vmx/realmode.c +@@ -39,7 +39,9 @@ static void realmode_deliver_exception( + + again: + last_byte = (vector * 4) + 3; +- if ( idtr->limit < last_byte ) ++ if ( idtr->limit < last_byte || ++ hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4) != ++ HVMCOPY_okay ) + { + /* Software interrupt? */ + if ( insn_len != 0 ) +@@ -64,8 +66,6 @@ static void realmode_deliver_exception( + } + } + +- (void)hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4); +- + frame[0] = regs->eip + insn_len; + frame[1] = csr->sel; + frame[2] = regs->eflags & ~X86_EFLAGS_RF; diff --git a/CVE-2013-4356-xsa64.patch b/CVE-2013-4356-xsa64.patch new file mode 100644 index 0000000..5def99b --- /dev/null +++ b/CVE-2013-4356-xsa64.patch @@ -0,0 +1,54 @@ +References: bnc#840593 CVE-2013-4356 XSA-64 + +x86/mm/shadow: Fix initialization of PV shadow L4 tables. + +Shadowed PV L4 tables must have the same Xen mappings as their +unshadowed equivalent. 
This is done by copying the Xen entries +verbatim from the idle pagetable, and then using guest_l4_slot() +in the SHADOW_FOREACH_L4E() iterator to avoid touching those entries. + +adc5afbf1c70ef55c260fb93e4b8ce5ccb918706 (x86: support up to 16Tb) +changed the definition of ROOT_PAGETABLE_XEN_SLOTS to extend right to +the top of the address space, which causes the shadow code to +copy Xen mappings into guest-kernel-address slots too. + +In the common case, all those slots are zero in the idle pagetable, +and no harm is done. But if any slot above #271 is non-zero, Xen will +crash when that slot is later cleared (it attempts to drop +shadow-pagetable refcounts on its own L4 pagetables). + +Fix by using the new ROOT_PAGETABLE_PV_XEN_SLOTS when appropriate. +Monitor pagetables need the full Xen mappings, so they keep using the +old name (with its new semantics). + +This is CVE-2013-4356 / XSA-64. + +Reported-by: Andrew Cooper +Signed-off-by: Tim Deegan +Tested-by: Andrew Cooper +Reviewed-by: Jan Beulich + +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -1433,15 +1433,19 @@ void sh_install_xen_entries_in_l4(struct + { + struct domain *d = v->domain; + shadow_l4e_t *sl4e; ++ unsigned int slots; + + sl4e = sh_map_domain_page(sl4mfn); + ASSERT(sl4e != NULL); + ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); +- ++ + /* Copy the common Xen mappings from the idle domain */ ++ slots = (shadow_mode_external(d) ++ ? ROOT_PAGETABLE_XEN_SLOTS ++ : ROOT_PAGETABLE_PV_XEN_SLOTS); + memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], +- ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); ++ slots * sizeof(l4_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = diff --git a/CVE-2013-4361-xsa66.patch b/CVE-2013-4361-xsa66.patch new file mode 100644 index 0000000..aa80323 --- /dev/null +++ b/CVE-2013-4361-xsa66.patch @@ -0,0 +1,25 @@ +References: bnc#841766 CVE-2013-4361 XSA-66 + +x86: properly set up fbld emulation operand address + +This is CVE-2013-4361 / XSA-66. 
+ +Signed-off-by: Jan Beulich +Acked-by: Ian Jackson + +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -3156,11 +3156,11 @@ x86_emulate( + break; + case 4: /* fbld m80dec */ + ea.bytes = 10; +- dst = ea; ++ src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, + &src.val, src.bytes, ctxt)) != 0 ) + goto done; +- emulate_fpu_insn_memdst("fbld", src.val); ++ emulate_fpu_insn_memsrc("fbld", src.val); + break; + case 5: /* fild m64i */ + ea.bytes = 8; diff --git a/block-dmmd b/block-dmmd index c386392..4635d90 100644 --- a/block-dmmd +++ b/block-dmmd @@ -2,13 +2,22 @@ # Usage: block-dmmd [add args | remove args] # -# the xm config file should have something like: +# the dmmd device syntax (in xm commands/configs) is something like: # dmmd:md;/dev/md0;md;/dev/md1;lvm;/dev/vg1/lv1 # or # dmmd:lvm;/dev/vg1/lv1;lvm;/dev/vg1/lv2;md;/dev/md0 -# note the last device will be used for VM - +# device pairs (type;dev) are processed in order, with the last device +# assigned to the VM +# +# md devices can optionally: +# specify a config file through: +# md;/dev/md100(/var/xen/config/mdadm.conf) +# use an array name (mdadm -N option): +# dmmd:md;My-MD-name;lvm;/dev/vg1/lv1 +# # History: +# 2013-07-03, loic.devulder@mpsa.com: +# Partial rewrite of the script for supporting MD activation by name # 2009-06-09, mh@novell.com: # Emit debugging messages into a temporary file; if no longer needed, # just comment the exec I/O redirection below @@ -39,7 +48,7 @@ function run_mdadm() local msg local rc - msg="`/sbin/mdadm $mdadm_cmd 2>&1`" + msg="$(/sbin/mdadm $mdadm_cmd 2>&1)" rc=$? case "$msg" in *"has been started"* | *"already active"* ) @@ -59,11 +68,12 @@ function run_mdadm() function activate_md() { + # Make it explicitly local local par=$1 - local already_active=0 cfg dev rc t + local cfg dev dev_path rc t mdadm_opts if [ ${par} = ${par%%(*} ]; then - # No configuration file specified: + # No configuration file specified dev=$par cfg= else @@ -71,23 +81,50 @@ function activate_md() t=${par#*(} cfg="-c ${t%%)*}" fi - if /sbin/mdadm -Q -D $dev; then - already_active=1 + + # Looking for device name or aliase + if [ ${dev:0:1} = / ]; then + dev_path=${dev%/*} + mdadm_opts= + else + dev_path=/dev/md + mdadm_opts="-s -N" fi - run_mdadm "-A $dev $cfg" + + # Is md device already active? + # We need to use full path name, aliase is not possible... + /sbin/mdadm -Q -D $dev_path/${dev##*/} > /dev/null 2>&1 \ + && return 0 + + run_mdadm "-A $mdadm_opts $dev $cfg" rc=$? - if [ $already_active -eq 1 ] && [ $rc -eq 2 ]; then - return 0 - fi + [ $rc -eq 2 ] && return 0 + return $rc } function deactivate_md() { - local par=$1 # Make it explicitly local + local par=$1 + local dev + + if [ ${par} = ${par%%(*} ]; then + # No configuration file specified + dev=${par} + else + dev=${par%%(*} + fi + + # Looking for device name or aliase + if [ ${dev:0:1} = / ]; then + dev_path=${dev%/*} + else + dev_path=/dev/md + fi + + # We need the device name only while deactivating + /sbin/mdadm -S ${dev_path}/${dev##*/} > /dev/null 2>&1 - ## We need the device name only while deactivating - /sbin/mdadm -S ${par%%(*} return $? } @@ -99,14 +136,20 @@ function activate_lvm() # Parse device-create-timeout from /etc/xen/xend-config.sxp # If not set, use default timeout of 90s - parsed_timeout=$(grep -v "^[ \t]*#.*" /etc/xen/xend-config.sxp|sed -n 's/(device-create-timeout \+\([0-9]\+\))/\1/p') - if [ ! 
-z $parsed_timeout ]; then - run_timeout=$((${parsed_timeout}*9/10)) - fi + parsed_timeout=$(grep -v "^[ \t]*#.*" /etc/xen/xend-config.sxp \ + | sed -n 's/(device-create-timeout \+\([0-9]\+\))/\1/p') + [ ! -z $parsed_timeout ] \ + && run_timeout=$((${parsed_timeout}*9/10)) + + # First scan for PVs and VGs + # We need this for using md device as PV + /sbin/pvscan > /dev/null 2>&1 +# /sbin/vgscan --mknodes > /dev/null 2>&1 end_time=$(($(date +%s)+${run_timeout})) while true; do - /sbin/lvchange -aey $1 + /sbin/lvchange -aey $1 > /dev/null 2>&1 + if [ $? -eq 0 -a -e $1 ]; then return 0 fi @@ -122,7 +165,8 @@ function activate_lvm() function deactivate_lvm() { - /sbin/lvchange -aen $1 + /sbin/lvchange -aen $1 > /dev/null 2>&1 + if [ $? -eq 0 ]; then # We may have to deactivate the VG now, but can ignore errors: # /sbin/vgchange -an ${1%/*} || : @@ -227,7 +271,6 @@ function parse_par() fi fi push "$t $s" - done } @@ -246,11 +289,11 @@ case "$command" in fi lastparam=${dmmd##*;} usedevice=${lastparam%(*} - xenstore-write $XENBUS_PATH/node "$usedevice" - write_dev "$usedevice" - release_lock "dmmd" - exit 0 - ;; + xenstore-write $XENBUS_PATH/node "$usedevice" + write_dev "$usedevice" + release_lock "dmmd" + exit 0 + ;; remove) p=`xenstore-read $XENBUS_PATH/params` || true diff --git a/magic_ioport_compat.patch b/magic_ioport_compat.patch index d89f355..5925e54 100644 --- a/magic_ioport_compat.patch +++ b/magic_ioport_compat.patch @@ -4,7 +4,7 @@ Signed-off-by: K. Y. Srinivasan --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c -@@ -316,7 +316,10 @@ static int check_platform_magic(struct d +@@ -322,7 +322,10 @@ static int check_platform_magic(struct d if (magic != XEN_IOPORT_MAGIC_VAL) { err = "unrecognised magic value"; diff --git a/supported_module.patch b/supported_module.patch index 05b7a48..9010b3a 100644 --- a/supported_module.patch +++ b/supported_module.patch @@ -6,10 +6,11 @@ Index: xen-4.2.0-testing/unmodified_drivers/linux-2.6/Module.supported =================================================================== --- /dev/null +++ xen-4.2.0-testing/unmodified_drivers/linux-2.6/Module.supported -@@ -0,0 +1,6 @@ +@@ -0,0 +1,7 @@ +xen-vbd +xen-platform-pci +xen-vnif +xenbus +xen-balloon +xen-scsi ++xen-usb diff --git a/xen.changes b/xen.changes index 7f62ee5..01c360c 100644 --- a/xen.changes +++ b/xen.changes @@ -1,3 +1,68 @@ +------------------------------------------------------------------- +Wed Oct 2 15:58:47 MDT 2013 - jfehlig@suse.com + +- Improvements to block-dmmd script + bnc#828623 + +------------------------------------------------------------------- +Mon Sep 30 10:48:29 MDT 2013 - carnold@suse.com + +- bnc#839596 - VUL-0: CVE-2013-1442: XSA-62: xen: Information leak + on AVX and/or LWP capable CPUs + 5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch +- bnc#840592 - VUL-0: CVE-2013-4355: XSA-63: xen: Information leaks + through I/O instruction emulation + CVE-2013-4355-xsa63.patch +- bnc#840593 - VUL-0: CVE-2013-4356: XSA-64: xen: Memory accessible + by 64-bit PV guests under live migration + CVE-2013-4356-xsa64.patch +- bnc#841766 - VUL-1: CVE-2013-4361: XSA-66: xen: Information leak + through fbld instruction emulation + CVE-2013-4361-xsa66.patch +- bnc#833796 - L3: Xen: migration broken from xsave-capable to + xsave-incapable host + 52205e27-x86-xsave-initialization-improvements.patch + 522dc0e6-x86-xsave-fix-migration-from-xsave-capable-to-xsave-incapable-host.patch +- 
bnc#839600 - [HP BCS SLES11 Bug]: In HP’s UEFI x86_64 platform and + sles11sp3 with xen environment, xen hypervisor will panic on + multiple blades nPar. + 523172d5-x86-fix-memory-cut-off-when-using-PFN-compression.patch +- bnc#833251 - [HP BCS SLES11 Bug]: In HP’s UEFI x86_64 platform + and with xen environment, in booting stage ,xen hypervisor will + panic. + 522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch +- bnc#834751 - [HP BCS SLES11 Bug]: In xen, “shutdown –y 0 –h” + cannot power off system + 522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch +- Upstream patches from Jan + 520119fc-xen-conring-Write-to-console-ring-even-if-console-lock-is-busted.patch + 520a2705-watchdog-crash-Always-disable-watchdog-in-console_force_unlock.patch + 522d8a1f-x86-allow-guest-to-set-clear-MSI-X-mask-bit-try-2.patch + 522dc044-xmalloc-make-whole-pages-xfree-clear-the-order-field-ab-used-by-xmalloc.patch + 522f2f9f-Nested-VMX-Clear-bit-31-of-IA32_VMX_BASIC-MSR.patch + 522f37b2-sched-arinc653-check-for-guest-data-transfer-failures.patch + 5231e090-libxc-x86-fix-page-table-creation-for-huge-guests.patch + 5231f00c-cpufreq-missing-check-of-copy_from_guest.patch + 523304b6-x86-machine_restart-must-not-call-acpi_dmar_reinstate-twice.patch + 5239a064-x86-HVM-fix-failure-path-in-hvm_vcpu_initialise.patch + 5239a076-VMX-fix-failure-path-in-construct_vmcs.patch + 523c0ed4-x86-HVM-properly-handle-wide-MMIO.patch + 523c1758-sched_credit-filter-node-affinity-mask-against-online-cpus.patch + 523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch + 523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch + +------------------------------------------------------------------- +Wed Sep 25 18:54:24 CEST 2013 - ohering@suse.de + +- Use upstream version of unplugging in PVonHVM guests + add 523c1834-unmodified_drivers-enable-unplug-per-default.patch + remove disable_emulated_device.patch + +------------------------------------------------------------------- +Wed Sep 25 16:17:37 CEST 2013 - ohering@suse.de + +- fate#315714 - Support pvUSB in Xen HVM guests, add xen-usb.ko + ------------------------------------------------------------------- Mon Sep 9 09:26:18 MDT 2013 - carnold@suse.com diff --git a/xen.spec b/xen.spec index 497c34d..ee881dc 100644 --- a/xen.spec +++ b/xen.spec @@ -15,6 +15,7 @@ # Please submit bugfixes or comments via http://bugs.opensuse.org/ # + Name: xen ExclusiveArch: %ix86 x86_64 %arm aarch64 %define xvers 4.3 @@ -138,7 +139,7 @@ BuildRequires: xorg-x11 BuildRequires: lndir %endif %endif -Version: 4.3.0_10 +Version: 4.3.0_12 Release: 0 PreReq: %insserv_prereq %fillup_prereq Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) @@ -208,31 +209,54 @@ Patch12: 51e7963f-x86-time-Update-wallclock-in-shared-info-when-altering- Patch13: 51ffd577-fix-off-by-one-mistakes-in-vm_alloc.patch Patch14: 51ffd5fd-x86-refine-FPU-selector-handling-code-for-XSAVEOPT.patch Patch15: 520114bb-Nested-VMX-Flush-TLBs-and-Caches-if-paging-mode-changed.patch -Patch16: 520a24f6-x86-AMD-Fix-nested-svm-crash-due-to-assertion-in-__virt_to_maddr.patch -Patch17: 520a2570-x86-AMD-Inject-GP-instead-of-UD-when-unable-to-map-vmcb.patch -Patch18: 520a5504-VMX-add-boot-parameter-to-enable-disable-APIC-v-dynamically.patch -Patch19: 520b4b60-VT-d-protect-against-bogus-information-coming-from-BIOS.patch -Patch20: 520b4bda-x86-MTRR-fix-range-check-in-mtrr_add_page.patch -Patch21: 
520cb8b6-x86-time-fix-check-for-negative-time-in-__update_vcpu_system_time.patch -Patch22: 52146070-ACPI-fix-acpi_os_map_memory.patch -Patch23: 5214d26a-VT-d-warn-about-CFI-being-enabled-by-firmware.patch -Patch24: 5215d094-Nested-VMX-Check-whether-interrupt-is-blocked-by-TPR.patch -Patch25: 5215d0c5-Nested-VMX-Force-check-ISR-when-L2-is-running.patch -Patch26: 5215d135-Nested-VMX-Clear-APIC-v-control-bit-in-vmcs02.patch -Patch27: 5215d2d5-Nested-VMX-Update-APIC-v-RVI-SVI-when-vmexit-to-L1.patch -Patch28: 5215d8b0-Correct-X2-APIC-HVM-emulation.patch -Patch29: 521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch -Patch30: 521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch -Patch31: 521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch -Patch32: 521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch -Patch33: 521e1156-x86-AVX-instruction-emulation-fixes.patch -Patch34: 521ef8d9-AMD-IOMMU-add-missing-checks.patch -Patch35: 52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch -Patch36: 52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch -Patch37: 52205e27-x86-xsave-initialization-improvements.patch -Patch38: 5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch -Patch39: 52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch -Patch40: 52285317-hvmloader-fix-SeaBIOS-interface.patch +Patch16: 520119fc-xen-conring-Write-to-console-ring-even-if-console-lock-is-busted.patch +Patch17: 520a24f6-x86-AMD-Fix-nested-svm-crash-due-to-assertion-in-__virt_to_maddr.patch +Patch18: 520a2570-x86-AMD-Inject-GP-instead-of-UD-when-unable-to-map-vmcb.patch +Patch19: 520a2705-watchdog-crash-Always-disable-watchdog-in-console_force_unlock.patch +Patch20: 520a5504-VMX-add-boot-parameter-to-enable-disable-APIC-v-dynamically.patch +Patch21: 520b4b60-VT-d-protect-against-bogus-information-coming-from-BIOS.patch +Patch22: 520b4bda-x86-MTRR-fix-range-check-in-mtrr_add_page.patch +Patch23: 520cb8b6-x86-time-fix-check-for-negative-time-in-__update_vcpu_system_time.patch +Patch24: 52146070-ACPI-fix-acpi_os_map_memory.patch +Patch25: 5214d26a-VT-d-warn-about-CFI-being-enabled-by-firmware.patch +Patch26: 5215d094-Nested-VMX-Check-whether-interrupt-is-blocked-by-TPR.patch +Patch27: 5215d0c5-Nested-VMX-Force-check-ISR-when-L2-is-running.patch +Patch28: 5215d135-Nested-VMX-Clear-APIC-v-control-bit-in-vmcs02.patch +Patch29: 5215d2d5-Nested-VMX-Update-APIC-v-RVI-SVI-when-vmexit-to-L1.patch +Patch30: 5215d8b0-Correct-X2-APIC-HVM-emulation.patch +Patch31: 521c6d4a-x86-don-t-allow-Dom0-access-to-the-MSI-address-range.patch +Patch32: 521c6d6c-x86-don-t-allow-Dom0-access-to-the-HT-address-range.patch +Patch33: 521c6e23-x86-Intel-add-support-for-Haswell-CPU-models.patch +Patch34: 521db25f-Fix-inactive-timer-list-corruption-on-second-S3-resume.patch +Patch35: 521e1156-x86-AVX-instruction-emulation-fixes.patch +Patch36: 521ef8d9-AMD-IOMMU-add-missing-checks.patch +Patch37: 52205a7d-hvmloader-smbios-Correctly-count-the-number-of-tables-written.patch +Patch38: 52205a90-public-hvm_xs_strings.h-Fix-ABI-regression-for-OEM-SMBios-strings.patch +Patch39: 52205e27-x86-xsave-initialization-improvements.patch +Patch40: 5226020f-xend-handle-extended-PCI-configuration-space-when-saving-state.patch +Patch41: 52260214-xend-fix-file-descriptor-leak-in-pci-utilities.patch +Patch42: 52285317-hvmloader-fix-SeaBIOS-interface.patch +Patch43: 522d896b-x86-EFI-properly-handle-run-time-memory-regions-outside-the-1-1-map.patch 
+Patch44: 522d8a1f-x86-allow-guest-to-set-clear-MSI-X-mask-bit-try-2.patch +Patch45: 522dc044-xmalloc-make-whole-pages-xfree-clear-the-order-field-ab-used-by-xmalloc.patch +Patch46: 522dc0e6-x86-xsave-fix-migration-from-xsave-capable-to-xsave-incapable-host.patch +Patch47: 522f2f9f-Nested-VMX-Clear-bit-31-of-IA32_VMX_BASIC-MSR.patch +Patch48: 522f37b2-sched-arinc653-check-for-guest-data-transfer-failures.patch +Patch49: 523172d5-x86-fix-memory-cut-off-when-using-PFN-compression.patch +Patch50: 5231e090-libxc-x86-fix-page-table-creation-for-huge-guests.patch +Patch51: 5231f00c-cpufreq-missing-check-of-copy_from_guest.patch +Patch52: 523304b6-x86-machine_restart-must-not-call-acpi_dmar_reinstate-twice.patch +Patch53: 5239a064-x86-HVM-fix-failure-path-in-hvm_vcpu_initialise.patch +Patch54: 5239a076-VMX-fix-failure-path-in-construct_vmcs.patch +Patch55: 523c0ed4-x86-HVM-properly-handle-wide-MMIO.patch +Patch56: 523c1758-sched_credit-filter-node-affinity-mask-against-online-cpus.patch +Patch57: 523c1834-unmodified_drivers-enable-unplug-per-default.patch +Patch58: 523ff393-x86-HVM-linear-address-must-be-canonical-for-the-whole-accessed-range.patch +Patch59: 523ff3e2-x86-HVM-refuse-doing-string-operations-in-certain-situations.patch +Patch60: 5242a1b5-x86-xsave-initialize-extended-register-state-when-guests-enable-it.patch +Patch6300: CVE-2013-4355-xsa63.patch +Patch6400: CVE-2013-4356-xsa64.patch +Patch6600: CVE-2013-4361-xsa66.patch # Upstream qemu patches # Our patches Patch301: xen-destdir.patch @@ -270,7 +294,7 @@ Patch503: x86-dom-print.patch Patch504: x86-extra-trap-info.patch Patch520: supported_module.patch Patch521: magic_ioport_compat.patch -Patch523: disable_emulated_device.patch +Patch522: xen_pvonhvm.pvusb.patch # Legacy Xend and Qemu patches Patch800: xend-traditional-qemu.patch # Build patches @@ -561,6 +585,29 @@ Authors %patch38 -p1 %patch39 -p1 %patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 +%patch53 -p1 +%patch54 -p1 +%patch55 -p1 +%patch56 -p1 +%patch57 -p1 +%patch58 -p1 +%patch59 -p1 +%patch60 -p1 +%patch6300 -p1 +%patch6400 -p1 +%patch6600 -p1 %patch301 -p1 %patch302 -p1 %patch303 -p1 @@ -593,7 +640,7 @@ Authors %patch504 -p1 %patch520 -p1 %patch521 -p1 -%patch523 -p1 +%patch522 -p1 %patch800 -p1 %patch99997 -p1 %patch99998 -p1 diff --git a/xen_pvonhvm.pvusb.patch b/xen_pvonhvm.pvusb.patch new file mode 100644 index 0000000..527f10e --- /dev/null +++ b/xen_pvonhvm.pvusb.patch @@ -0,0 +1,35 @@ +fate#315714: Support pvUSB in Xen HVM guests +--- + unmodified_drivers/linux-2.6/Makefile | 1 + + unmodified_drivers/linux-2.6/usbfront/Kbuild | 6 ++++++ + unmodified_drivers/linux-2.6/usbfront/Makefile | 3 +++ + 3 files changed, 10 insertions(+) + +Index: xen-4.3.0-testing/unmodified_drivers/linux-2.6/Makefile +=================================================================== +--- xen-4.3.0-testing.orig/unmodified_drivers/linux-2.6/Makefile ++++ xen-4.3.0-testing/unmodified_drivers/linux-2.6/Makefile +@@ -5,3 +5,4 @@ obj-m += balloon/ + obj-m += blkfront/ + obj-m += netfront/ + obj-m += scsifront/ ++obj-m += usbfront/ +Index: xen-4.3.0-testing/unmodified_drivers/linux-2.6/usbfront/Kbuild +=================================================================== +--- /dev/null ++++ xen-4.3.0-testing/unmodified_drivers/linux-2.6/usbfront/Kbuild +@@ -0,0 +1,6 @@ ++include $(M)/overrides.mk ++ ++obj-m += xen-usb.o ++ ++xen-usb-objs := usbfront-hcd.o xenbus.o 
++ +Index: xen-4.3.0-testing/unmodified_drivers/linux-2.6/usbfront/Makefile +=================================================================== +--- /dev/null ++++ xen-4.3.0-testing/unmodified_drivers/linux-2.6/usbfront/Makefile +@@ -0,0 +1,3 @@ ++ifneq ($(KERNELRELEASE),) ++include $(src)/Kbuild ++endif