From 763b78040df93cf1d44387df791467c5ec87d4858300316fa455c735200bb046 Mon Sep 17 00:00:00 2001 From: Charles Arnold Date: Fri, 10 Jul 2015 15:21:29 +0000 Subject: [PATCH] - bnc#935634 - VUL-0: CVE-2015-3259: xen: XSA-137: xl command line config handling stack overflow CVE-2015-3259-xsa137.patch - Upstream patches from Jan 558bfaa0-x86-traps-avoid-using-current-too-early.patch 5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch 559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch 559bdde5-pull-in-latest-linux-earlycpio.patch - Upstream patches from Jan pending review 552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch 552d0fe8-x86-mtrr-include-asm-atomic.h.patch 552d293b-x86-vMSI-X-honor-all-mask-requests.patch 552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch 554c7aee-x86-provide-arch_fetch_and_add.patch 554c7b00-arm-provide-arch_fetch_and_add.patch 55534b0a-x86-provide-add_sized.patch 55534b25-arm-provide-add_sized.patch 5555a4f8-use-ticket-locks-for-spin-locks.patch 5555a5b9-x86-arm-remove-asm-spinlock-h.patch 5555a8ec-introduce-non-contiguous-allocation.patch 55795a52-x86-vMSI-X-support-qword-MMIO-access.patch 557eb55f-gnttab-per-active-entry-locking.patch 557eb5b6-gnttab-introduce-maptrack-lock.patch 557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch 557ffab8-evtchn-factor-out-freeing-an-event-channel.patch 5582bf43-evtchn-simplify-port_is_valid.patch 5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch 5583d9c5-x86-MSI-X-cleanup.patch 5583da09-x86-MSI-track-host-and-guest-masking-separately.patch 5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch OBS-URL: https://build.opensuse.org/package/show/Virtualization/xen?expand=0&rev=369 --- ...x86-hvm-don-t-include-asm-spinlock-h.patch | 41 + 552d0fe8-x86-mtrr-include-asm-atomic.h.patch | 22 + ...b-x86-vMSI-X-honor-all-mask-requests.patch | 44 ++ ...add-valid-bits-for-read-acceleration.patch | 56 ++ 554c7aee-x86-provide-arch_fetch_and_add.patch | 68 ++ 554c7b00-arm-provide-arch_fetch_and_add.patch | 29 + 55534b0a-x86-provide-add_sized.patch | 65 ++ 55534b25-arm-provide-add_sized.patch | 64 ++ ...a4f8-use-ticket-locks-for-spin-locks.patch | 305 ++++++++ 5555a5b9-x86-arm-remove-asm-spinlock-h.patch | 266 +++++++ ...-introduce-non-contiguous-allocation.patch | 141 ++++ ...x86-vMSI-X-support-qword-MMIO-access.patch | 97 +++ ...b55f-gnttab-per-active-entry-locking.patch | 551 +++++++++++++ 557eb5b6-gnttab-introduce-maptrack-lock.patch | 86 ++ ...e-grant-table-lock-a-read-write-lock.patch | 733 ++++++++++++++++++ ...-factor-out-freeing-an-event-channel.patch | 47 ++ 5582bf43-evtchn-simplify-port_is_valid.patch | 69 ++ ...king-when-unmasking-an-event-channel.patch | 32 + 5583d9c5-x86-MSI-X-cleanup.patch | 285 +++++++ ...ck-host-and-guest-masking-separately.patch | 122 ++- ...tab-use-per-VCPU-maptrack-free-lists.patch | 284 +++++++ ...al-maptrack-entries-from-other-VCPUs.patch | 153 ++++ ...ear-xen_consumer-when-clearing-state.patch | 105 +++ ...-evtchn-s-until-evtchn_destroy_final.patch | 110 +++ ...vent-channel-lock-for-sending-events.patch | 257 ++++++ ...evtchn-pad-struct-evtchn-to-64-bytes.patch | 27 + ...-traps-avoid-using-current-too-early.patch | 23 + ...d-EPT-fix-the-handling-of-nested-EPT.patch | 50 ++ ...ept-don-t-unmap-in-use-EPT-pagetable.patch | 64 ++ 559bdde5-pull-in-latest-linux-earlycpio.patch | 102 +++ CVE-2015-3259-xsa137.patch | 216 ++++++ libxl.pvscsi.patch | 74 +- qemu-MSI-X-enable-maskall.patch | 333 -------- qemu-MSI-X-latch-writes.patch | 141 ---- 
x86-MSI-X-enable.patch | 101 ++- x86-MSI-X-maskall.patch | 423 ++-------- x86-MSI-X-teardown.patch | 101 ++- x86-MSI-mask.patch | 48 ++ x86-MSI-pv-unmask.patch | 93 +++ x86-PCI-CFG-write-intercept.patch | 114 +++ x86-pci_cfg_okay.patch | 156 ++++ xen.changes | 50 ++ xen.spec | 113 ++- 43 files changed, 5205 insertions(+), 1056 deletions(-) create mode 100644 552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch create mode 100644 552d0fe8-x86-mtrr-include-asm-atomic.h.patch create mode 100644 552d293b-x86-vMSI-X-honor-all-mask-requests.patch create mode 100644 552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch create mode 100644 554c7aee-x86-provide-arch_fetch_and_add.patch create mode 100644 554c7b00-arm-provide-arch_fetch_and_add.patch create mode 100644 55534b0a-x86-provide-add_sized.patch create mode 100644 55534b25-arm-provide-add_sized.patch create mode 100644 5555a4f8-use-ticket-locks-for-spin-locks.patch create mode 100644 5555a5b9-x86-arm-remove-asm-spinlock-h.patch create mode 100644 5555a8ec-introduce-non-contiguous-allocation.patch create mode 100644 55795a52-x86-vMSI-X-support-qword-MMIO-access.patch create mode 100644 557eb55f-gnttab-per-active-entry-locking.patch create mode 100644 557eb5b6-gnttab-introduce-maptrack-lock.patch create mode 100644 557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch create mode 100644 557ffab8-evtchn-factor-out-freeing-an-event-channel.patch create mode 100644 5582bf43-evtchn-simplify-port_is_valid.patch create mode 100644 5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch create mode 100644 5583d9c5-x86-MSI-X-cleanup.patch rename x86-MSI-X-guest-mask.patch => 5583da09-x86-MSI-track-host-and-guest-masking-separately.patch (73%) create mode 100644 5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch create mode 100644 5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch create mode 100644 5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch create mode 100644 5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch create mode 100644 5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch create mode 100644 5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch create mode 100644 558bfaa0-x86-traps-avoid-using-current-too-early.patch create mode 100644 5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch create mode 100644 559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch create mode 100644 559bdde5-pull-in-latest-linux-earlycpio.patch create mode 100644 CVE-2015-3259-xsa137.patch delete mode 100644 qemu-MSI-X-enable-maskall.patch delete mode 100644 qemu-MSI-X-latch-writes.patch create mode 100644 x86-MSI-mask.patch create mode 100644 x86-MSI-pv-unmask.patch create mode 100644 x86-PCI-CFG-write-intercept.patch create mode 100644 x86-pci_cfg_okay.patch diff --git a/552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch b/552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch new file mode 100644 index 0000000..49373f4 --- /dev/null +++ b/552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch @@ -0,0 +1,41 @@ +# Commit 63dcef9fe5b880007075b5eb53f9950a826519ce +# Date 2015-04-14 15:02:10 +0200 +# Author David Vrabel +# Committer Jan Beulich +x86/hvm: don't include asm/spinlock.h + +asm/spinlock.h should not be included directly. 
+ +Signed-off-by: David Vrabel +Acked-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/hvm/hvm.c 2015-07-08 14:13:16.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/hvm.c 2015-07-08 14:13:38.000000000 +0200 +@@ -52,7 +52,6 @@ + #include + #include + #include +-#include + #include + #include + #include +--- sle12sp1.orig/xen/arch/x86/hvm/svm/svm.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/arch/x86/hvm/svm/svm.c 2015-07-08 14:13:38.000000000 +0200 +@@ -41,7 +41,6 @@ + #include + #include + #include +-#include + #include + #include + #include +--- sle12sp1.orig/xen/arch/x86/hvm/vmx/vmx.c 2015-05-19 23:16:48.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/vmx/vmx.c 2015-07-08 14:13:38.000000000 +0200 +@@ -35,7 +35,6 @@ + #include + #include + #include +-#include + #include + #include + #include diff --git a/552d0fe8-x86-mtrr-include-asm-atomic.h.patch b/552d0fe8-x86-mtrr-include-asm-atomic.h.patch new file mode 100644 index 0000000..474e46d --- /dev/null +++ b/552d0fe8-x86-mtrr-include-asm-atomic.h.patch @@ -0,0 +1,22 @@ +# Commit f70df9ec1ab72b6bbebad72d81109c1b214007e1 +# Date 2015-04-14 15:02:32 +0200 +# Author David Vrabel +# Committer Jan Beulich +x86/mtrr: include asm/atomic.h + +asm/atomic.h is needed but only included indirectly via +asm/spinlock.h. + +Signed-off-by: David Vrabel +Reviewed-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/cpu/mtrr/main.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/arch/x86/cpu/mtrr/main.c 2015-07-08 14:13:42.000000000 +0200 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + #include + #include diff --git a/552d293b-x86-vMSI-X-honor-all-mask-requests.patch b/552d293b-x86-vMSI-X-honor-all-mask-requests.patch new file mode 100644 index 0000000..bbae1d3 --- /dev/null +++ b/552d293b-x86-vMSI-X-honor-all-mask-requests.patch @@ -0,0 +1,44 @@ +# Commit 70a3cbb8c9cb17a61fa25c48ba3d7b44fd059c90 +# Date 2015-04-14 16:50:35 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/vMSI-X: honor all mask requests + +Commit 74fd0036de ("x86: properly handle MSI-X unmask operation from +guests") didn't go far enough: it fixed an issue with unmasking, but +left an issue with masking in place: Due to the (late) point in time +when qemu requests the hypervisor to set up MSI-X interrupts (which is +where the MMIO intercept gets put in place), the hypervisor doesn't +see all guest writes, and hence shouldn't make assumptions on the state +the virtual MSI-X resources are in. Bypassing the rest of the logic on +a guest mask operation leads to + +[00:04.0] pci_msix_write: Error: Can't update msix entry 1 since MSI-X is already enabled. + +which surprisingly enough doesn't lead to the device not working +anymore (I didn't dig in deep enough to figure out why that is). But it +does prevent the IRQ to be migrated inside the guest, i.e. all +interrupts will always arrive in vCPU 0. + +Signed-off-by: Jan Beulich +Reviewed-by: Konrad Rzeszutek Wilk +Reviewed-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/hvm/vmsi.c 2015-07-08 11:22:13.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/vmsi.c 2015-04-20 09:30:29.000000000 +0200 +@@ -286,11 +286,11 @@ static int msixtbl_write(struct vcpu *v, + goto out; + } + +- /* exit to device model if address/data has been modified */ +- if ( test_and_clear_bit(nr_entry, &entry->table_flags) ) ++ /* Exit to device model when unmasking and address/data got modified. 
*/ ++ if ( !(val & PCI_MSIX_VECTOR_BITMASK) && ++ test_and_clear_bit(nr_entry, &entry->table_flags) ) + { +- if ( !(val & PCI_MSIX_VECTOR_BITMASK) ) +- v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address; ++ v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address; + goto out; + } + diff --git a/552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch b/552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch new file mode 100644 index 0000000..9fbabcb --- /dev/null +++ b/552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch @@ -0,0 +1,56 @@ +# Commit df9f5676b3711c95127d44e871ad7ca38d6ed28a +# Date 2015-04-14 16:51:18 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/vMSI-X: add valid bits for read acceleration + +Again because Xen doesn't get to see all guest writes, it shouldn't +serve reads from its cache before having seen a write to the respective +address. + +Also use DECLARE_BITMAP() in a related field declaration instead of +open coding it. + +Signed-off-by: Jan Beulich +Reviewed-by: Konrad Rzeszutek Wilk +Reviewed-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/hvm/vmsi.c 2015-04-20 09:30:29.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/vmsi.c 2015-04-20 09:32:57.000000000 +0200 +@@ -154,11 +154,14 @@ struct msixtbl_entry + struct pci_dev *pdev; + unsigned long gtable; /* gpa of msix table */ + unsigned long table_len; +- unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; ++ DECLARE_BITMAP(table_flags, MAX_MSIX_TABLE_ENTRIES); + #define MAX_MSIX_ACC_ENTRIES 3 + struct { + uint32_t msi_ad[3]; /* Shadow of address low, high and data */ + } gentries[MAX_MSIX_ACC_ENTRIES]; ++ DECLARE_BITMAP(acc_valid, 3 * MAX_MSIX_ACC_ENTRIES); ++#define acc_bit(what, ent, slot, idx) \ ++ what##_bit((slot) * 3 + (idx), (ent)->acc_valid) + struct rcu_head rcu; + }; + +@@ -233,9 +236,10 @@ static int msixtbl_read( + if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) + { + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; +- if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) +- goto out; + index = offset / sizeof(uint32_t); ++ if ( nr_entry >= MAX_MSIX_ACC_ENTRIES || ++ !acc_bit(test, entry, nr_entry, index) ) ++ goto out; + *pval = entry->gentries[nr_entry].msi_ad[index]; + } + else +@@ -281,6 +285,7 @@ static int msixtbl_write(struct vcpu *v, + { + index = offset / sizeof(uint32_t); + entry->gentries[nr_entry].msi_ad[index] = val; ++ acc_bit(set, entry, nr_entry, index); + } + set_bit(nr_entry, &entry->table_flags); + goto out; diff --git a/554c7aee-x86-provide-arch_fetch_and_add.patch b/554c7aee-x86-provide-arch_fetch_and_add.patch new file mode 100644 index 0000000..181effb --- /dev/null +++ b/554c7aee-x86-provide-arch_fetch_and_add.patch @@ -0,0 +1,68 @@ +# Commit 2bfc9fc52ce8485fa43e79bbdc32360c74e12fe8 +# Date 2015-05-08 10:59:26 +0200 +# Author David Vrabel +# Committer Jan Beulich +x86: provide arch_fetch_and_add() + +arch_fetch_and_add() atomically adds a value and returns the previous +value. + +This is needed to implement ticket locks. + +Signed-off-by: David Vrabel + +--- sle12sp1.orig/xen/include/asm-x86/system.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-x86/system.h 2015-07-08 12:35:11.000000000 +0200 +@@ -118,6 +118,52 @@ static always_inline unsigned long __cmp + }) + + /* ++ * Undefined symbol to cause link failure if a wrong size is used with ++ * arch_fetch_and_add(). 
++ */ ++extern unsigned long __bad_fetch_and_add_size(void); ++ ++static always_inline unsigned long __xadd( ++ volatile void *ptr, unsigned long v, int size) ++{ ++ switch ( size ) ++ { ++ case 1: ++ asm volatile ( "lock; xaddb %b0,%1" ++ : "+r" (v), "+m" (*__xg(ptr)) ++ :: "memory"); ++ return v; ++ case 2: ++ asm volatile ( "lock; xaddw %w0,%1" ++ : "+r" (v), "+m" (*__xg(ptr)) ++ :: "memory"); ++ return v; ++ case 4: ++ asm volatile ( "lock; xaddl %k0,%1" ++ : "+r" (v), "+m" (*__xg(ptr)) ++ :: "memory"); ++ return v; ++ case 8: ++ asm volatile ( "lock; xaddq %q0,%1" ++ : "+r" (v), "+m" (*__xg(ptr)) ++ :: "memory"); ++ ++ return v; ++ default: ++ return __bad_fetch_and_add_size(); ++ } ++} ++ ++/* ++ * Atomically add @v to the 1, 2, 4, or 8 byte value at @ptr. Returns ++ * the previous value. ++ * ++ * This is a full memory barrier. ++ */ ++#define arch_fetch_and_add(ptr, v) \ ++ ((typeof(*(ptr)))__xadd(ptr, (typeof(*(ptr)))(v), sizeof(*(ptr)))) ++ ++/* + * Both Intel and AMD agree that, from a programmer's viewpoint: + * Loads cannot be reordered relative to other loads. + * Stores cannot be reordered relative to other stores. diff --git a/554c7b00-arm-provide-arch_fetch_and_add.patch b/554c7b00-arm-provide-arch_fetch_and_add.patch new file mode 100644 index 0000000..e24e6a6 --- /dev/null +++ b/554c7b00-arm-provide-arch_fetch_and_add.patch @@ -0,0 +1,29 @@ +# Commit f9cc3cd9b4de58cf032c8624406384c172937e57 +# Date 2015-05-08 10:59:44 +0200 +# Author David Vrabel +# Committer Jan Beulich +arm: provide arch_fetch_and_add() + +arch_fetch_and_add() atomically adds a value and returns the previous +value. + +This generic arm implementation uses the GCC __sync_fetch_and_add() +builtin. This builtin resulted in suitable inlined asm for GCC 4.8.3 +(arm64) and GCC 4.6.3 (arm32). + +This is needed to implement ticket locks. + +Signed-off-by: David Vrabel +Acked-by: Ian Campbell + +--- sle12sp1.orig/xen/include/asm-arm/system.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-arm/system.h 2015-07-08 12:35:16.000000000 +0200 +@@ -51,6 +51,8 @@ + # error "unknown ARM variant" + #endif + ++#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v) ++ + extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next); + + #endif diff --git a/55534b0a-x86-provide-add_sized.patch b/55534b0a-x86-provide-add_sized.patch new file mode 100644 index 0000000..f771b4c --- /dev/null +++ b/55534b0a-x86-provide-add_sized.patch @@ -0,0 +1,65 @@ +# Commit 3c694aec08dda782d9c866e599b848dff86f474f +# Date 2015-05-13 15:00:58 +0200 +# Author David Vrabel +# Committer Jan Beulich +x86: provide add_sized() + +add_sized(ptr, inc) adds inc to the value at ptr using only the correct +size of loads and stores for the type of *ptr. The add is /not/ atomic. + +This is needed for ticket locks to ensure the increment of the head ticket +does not affect the tail ticket. 
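For illustration only, and not part of the patch being quoted: the reason a sized, non-atomic add is enough here is that the two ticket halves introduced later in this series share one 32-bit word, and the releasing CPU must be able to bump one 16-bit half without a wider read-modify-write that could race with a concurrent update of the other half. A minimal C sketch of that idea, with hypothetical names:

    #include <stdint.h>

    /* Hypothetical ticket pair, mirroring the union added later in this series:
     * two 16-bit halves packed into one 32-bit word (little-endian assumed). */
    typedef union {
        uint32_t head_tail;
        struct {
            uint16_t head;   /* ticket currently being served */
            uint16_t tail;   /* next ticket to hand out */
        };
    } tickets_sketch_t;

    /* Stand-in for add_sized(&t->head, 1): the store is exactly 16 bits wide,
     * so it cannot clobber a concurrent atomic update of 'tail'. A naive
     * 32-bit load/add/store of head_tail could. The add itself is deliberately
     * not atomic; only the lock holder ever advances 'head'. */
    static void bump_head(volatile tickets_sketch_t *t)
    {
        t->head = (uint16_t)(t->head + 1);
    }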
+ +Signed-off-by: David Vrabel + +--- sle12sp1.orig/xen/include/asm-x86/atomic.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-x86/atomic.h 2015-07-08 12:35:20.000000000 +0200 +@@ -14,6 +14,14 @@ static inline void name(volatile type *a + { asm volatile("mov" size " %1,%0": "=m" (*(volatile type *)addr) \ + :reg (val) barrier); } + ++#define build_add_sized(name, size, type, reg) \ ++ static inline void name(volatile type *addr, type val) \ ++ { \ ++ asm volatile("add" size " %1,%0" \ ++ : "=m" (*addr) \ ++ : reg (val)); \ ++ } ++ + build_read_atomic(read_u8_atomic, "b", uint8_t, "=q", ) + build_read_atomic(read_u16_atomic, "w", uint16_t, "=r", ) + build_read_atomic(read_u32_atomic, "l", uint32_t, "=r", ) +@@ -25,8 +33,14 @@ build_write_atomic(write_u32_atomic, "l" + build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", ) + build_write_atomic(write_u64_atomic, "q", uint64_t, "r", ) + ++build_add_sized(add_u8_sized, "b", uint8_t, "qi") ++build_add_sized(add_u16_sized, "w", uint16_t, "ri") ++build_add_sized(add_u32_sized, "l", uint32_t, "ri") ++build_add_sized(add_u64_sized, "q", uint64_t, "ri") ++ + #undef build_read_atomic + #undef build_write_atomic ++#undef build_add_sized + + void __bad_atomic_size(void); + +@@ -54,6 +68,18 @@ void __bad_atomic_size(void); + __x; \ + }) + ++#define add_sized(p, x) ({ \ ++ typeof(*(p)) x_ = (x); \ ++ switch ( sizeof(*(p)) ) \ ++ { \ ++ case 1: add_u8_sized((uint8_t *)(p), x_); break; \ ++ case 2: add_u16_sized((uint16_t *)(p), x_); break; \ ++ case 4: add_u32_sized((uint32_t *)(p), x_); break; \ ++ case 8: add_u64_sized((uint64_t *)(p), x_); break; \ ++ default: __bad_atomic_size(); break; \ ++ } \ ++}) ++ + /* + * NB. I've pushed the volatile qualifier into the operations. This allows + * fast accessors such as _atomic_read() and _atomic_set() which don't give diff --git a/55534b25-arm-provide-add_sized.patch b/55534b25-arm-provide-add_sized.patch new file mode 100644 index 0000000..e843fa5 --- /dev/null +++ b/55534b25-arm-provide-add_sized.patch @@ -0,0 +1,64 @@ +# Commit 890674d13feb4a270aa112ca452dcf62fdd53f34 +# Date 2015-05-13 15:01:25 +0200 +# Author David Vrabel +# Committer Jan Beulich +arm: provide add_sized() + +add_sized(ptr, inc) adds inc to the value at ptr using only the correct +size of loads and stores for the type of *ptr. The add is /not/ atomic. + +This is needed for ticket locks to ensure the increment of the head ticket +does not affect the tail ticket. 
+ +Signed-off-by: David Vrabel +Acked-by: Ian Campbell + +--- sle12sp1.orig/xen/include/asm-arm/atomic.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-arm/atomic.h 2015-07-08 12:35:55.000000000 +0200 +@@ -23,6 +23,17 @@ static inline void name(volatile type *a + : reg (val)); \ + } + ++#define build_add_sized(name, size, width, type, reg) \ ++static inline void name(volatile type *addr, type val) \ ++{ \ ++ type t; \ ++ asm volatile("ldr" size " %"width"1,%0\n" \ ++ "add %"width"1,%"width"1,%"width"2\n" \ ++ "str" size " %"width"1,%0" \ ++ : "=m" (*(volatile type *)addr), "=r" (t) \ ++ : reg (val)); \ ++} ++ + #if defined (CONFIG_ARM_32) + #define BYTE "" + #define WORD "" +@@ -46,6 +57,10 @@ build_atomic_read(read_u64_atomic, "x", + build_atomic_write(write_u64_atomic, "x", uint64_t, "r") + #endif + ++build_add_sized(add_u8_sized, "b", BYTE, uint8_t, "ri") ++build_add_sized(add_u16_sized, "h", WORD, uint16_t, "ri") ++build_add_sized(add_u32_sized, "", WORD, uint32_t, "ri") ++ + void __bad_atomic_size(void); + + #define read_atomic(p) ({ \ +@@ -70,6 +85,17 @@ void __bad_atomic_size(void); + __x; \ + }) + ++#define add_sized(p, x) ({ \ ++ typeof(*(p)) __x = (x); \ ++ switch ( sizeof(*(p)) ) \ ++ { \ ++ case 1: add_u8_sized((uint8_t *)(p), __x); break; \ ++ case 2: add_u16_sized((uint16_t *)(p), __x); break; \ ++ case 4: add_u32_sized((uint32_t *)(p), __x); break; \ ++ default: __bad_atomic_size(); break; \ ++ } \ ++}) ++ + /* + * NB. I've pushed the volatile qualifier into the operations. This allows + * fast accessors such as _atomic_read() and _atomic_set() which don't give diff --git a/5555a4f8-use-ticket-locks-for-spin-locks.patch b/5555a4f8-use-ticket-locks-for-spin-locks.patch new file mode 100644 index 0000000..4109167 --- /dev/null +++ b/5555a4f8-use-ticket-locks-for-spin-locks.patch @@ -0,0 +1,305 @@ +# Commit 45fcc4568c5162b00fb3907fb158af82dd484a3d +# Date 2015-05-15 09:49:12 +0200 +# Author David Vrabel +# Committer Jan Beulich +use ticket locks for spin locks + +Replace the byte locks with ticket locks. Ticket locks are: a) fair; +and b) peform better when contented since they spin without an atomic +operation. + +The lock is split into two ticket values: head and tail. A locker +acquires a ticket by (atomically) increasing tail and using the +previous tail value. A CPU holds the lock if its ticket == head. The +lock is released by increasing head. + +spin_lock_irq() and spin_lock_irqsave() now spin with irqs disabled +(previously, they would spin with irqs enabled if possible). This is +required to prevent deadlocks when the irq handler tries to take the +same lock with a higher ticket. + +Architectures need only provide arch_fetch_and_add() and two barriers: +arch_lock_acquire_barrier() and arch_lock_release_barrier(). 
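Before the diff below, a compressed sketch of the scheme this commit message describes may help: a locker atomically takes a ticket from 'tail' and spins until 'head' reaches it, and unlock simply advances 'head'. This is illustrative plain C, not the patch's code; it uses a GCC builtin in place of arch_fetch_and_add() and omits the acquire/release barriers and cpu_relax() that the real implementation adds:

    #include <stdint.h>

    typedef union {
        uint32_t head_tail;
        struct {
            uint16_t head;             /* low half on little-endian x86 */
            uint16_t tail;             /* high half */
        };
    } ticketlock_sketch_t;             /* hypothetical type, for illustration */

    static void ticket_lock(volatile ticketlock_sketch_t *lock)
    {
        /* Take the next ticket: adding 0x10000 advances only 'tail'; the old
         * value of 'tail' is our ticket. The builtin is a full barrier. */
        uint32_t old = __sync_fetch_and_add(&lock->head_tail, 0x10000);
        uint16_t ticket = (uint16_t)(old >> 16);

        /* Wait until the currently served ticket is ours. */
        while (lock->head != ticket)
            ;                          /* real code: cpu_relax() */
    }

    static void ticket_unlock(volatile ticketlock_sketch_t *lock)
    {
        /* Only the holder writes 'head', so a sized non-atomic add suffices;
         * the real code issues arch_lock_release_barrier() first. */
        lock->head = (uint16_t)(lock->head + 1);
    }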
+ +Signed-off-by: David Vrabel +Reviewed-by: Tim Deegan +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/xen/common/spinlock.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/common/spinlock.c 2015-07-08 12:37:59.000000000 +0200 +@@ -115,125 +115,134 @@ void spin_debug_disable(void) + + #endif + ++static always_inline spinlock_tickets_t observe_lock(spinlock_tickets_t *t) ++{ ++ spinlock_tickets_t v; ++ ++ smp_rmb(); ++ v.head_tail = read_atomic(&t->head_tail); ++ return v; ++} ++ ++static always_inline u16 observe_head(spinlock_tickets_t *t) ++{ ++ smp_rmb(); ++ return read_atomic(&t->head); ++} ++ + void _spin_lock(spinlock_t *lock) + { ++ spinlock_tickets_t tickets = SPINLOCK_TICKET_INC; + LOCK_PROFILE_VAR; + + check_lock(&lock->debug); +- while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) ++ tickets.head_tail = arch_fetch_and_add(&lock->tickets.head_tail, ++ tickets.head_tail); ++ while ( tickets.tail != observe_head(&lock->tickets) ) + { + LOCK_PROFILE_BLOCK; +- while ( likely(_raw_spin_is_locked(&lock->raw)) ) +- cpu_relax(); ++ cpu_relax(); + } + LOCK_PROFILE_GOT; + preempt_disable(); ++ arch_lock_acquire_barrier(); + } + + void _spin_lock_irq(spinlock_t *lock) + { +- LOCK_PROFILE_VAR; +- + ASSERT(local_irq_is_enabled()); + local_irq_disable(); +- check_lock(&lock->debug); +- while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) +- { +- LOCK_PROFILE_BLOCK; +- local_irq_enable(); +- while ( likely(_raw_spin_is_locked(&lock->raw)) ) +- cpu_relax(); +- local_irq_disable(); +- } +- LOCK_PROFILE_GOT; +- preempt_disable(); ++ _spin_lock(lock); + } + + unsigned long _spin_lock_irqsave(spinlock_t *lock) + { + unsigned long flags; +- LOCK_PROFILE_VAR; + + local_irq_save(flags); +- check_lock(&lock->debug); +- while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) +- { +- LOCK_PROFILE_BLOCK; +- local_irq_restore(flags); +- while ( likely(_raw_spin_is_locked(&lock->raw)) ) +- cpu_relax(); +- local_irq_save(flags); +- } +- LOCK_PROFILE_GOT; +- preempt_disable(); ++ _spin_lock(lock); + return flags; + } + + void _spin_unlock(spinlock_t *lock) + { ++ arch_lock_release_barrier(); + preempt_enable(); + LOCK_PROFILE_REL; +- _raw_spin_unlock(&lock->raw); ++ add_sized(&lock->tickets.head, 1); + } + + void _spin_unlock_irq(spinlock_t *lock) + { +- preempt_enable(); +- LOCK_PROFILE_REL; +- _raw_spin_unlock(&lock->raw); ++ _spin_unlock(lock); + local_irq_enable(); + } + + void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) + { +- preempt_enable(); +- LOCK_PROFILE_REL; +- _raw_spin_unlock(&lock->raw); ++ _spin_unlock(lock); + local_irq_restore(flags); + } + + int _spin_is_locked(spinlock_t *lock) + { + check_lock(&lock->debug); +- return _raw_spin_is_locked(&lock->raw); ++ return lock->tickets.head != lock->tickets.tail; + } + + int _spin_trylock(spinlock_t *lock) + { ++ spinlock_tickets_t old, new; ++ + check_lock(&lock->debug); +- if ( !_raw_spin_trylock(&lock->raw) ) ++ old = observe_lock(&lock->tickets); ++ if ( old.head != old.tail ) ++ return 0; ++ new = old; ++ new.tail++; ++ if ( cmpxchg(&lock->tickets.head_tail, ++ old.head_tail, new.head_tail) != old.head_tail ) + return 0; + #ifdef LOCK_PROFILE + if (lock->profile) + lock->profile->time_locked = NOW(); + #endif + preempt_disable(); ++ /* ++ * cmpxchg() is a full barrier so no need for an ++ * arch_lock_acquire_barrier(). 
++ */ + return 1; + } + + void _spin_barrier(spinlock_t *lock) + { ++ spinlock_tickets_t sample; + #ifdef LOCK_PROFILE + s_time_t block = NOW(); +- u64 loop = 0; ++#endif + + check_barrier(&lock->debug); +- do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) ); +- if ((loop > 1) && lock->profile) ++ smp_mb(); ++ sample = observe_lock(&lock->tickets); ++ if ( sample.head != sample.tail ) + { +- lock->profile->time_block += NOW() - block; +- lock->profile->block_cnt++; +- } +-#else +- check_barrier(&lock->debug); +- do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) ); ++ while ( observe_head(&lock->tickets) == sample.head ) ++ cpu_relax(); ++#ifdef LOCK_PROFILE ++ if ( lock->profile ) ++ { ++ lock->profile->time_block += NOW() - block; ++ lock->profile->block_cnt++; ++ } + #endif ++ } + smp_mb(); + } + + int _spin_trylock_recursive(spinlock_t *lock) + { +- int cpu = smp_processor_id(); ++ unsigned int cpu = smp_processor_id(); + + /* Don't allow overflow of recurse_cpu field. */ + BUILD_BUG_ON(NR_CPUS > 0xfffu); +@@ -256,8 +265,17 @@ int _spin_trylock_recursive(spinlock_t * + + void _spin_lock_recursive(spinlock_t *lock) + { +- while ( !spin_trylock_recursive(lock) ) +- cpu_relax(); ++ unsigned int cpu = smp_processor_id(); ++ ++ if ( likely(lock->recurse_cpu != cpu) ) ++ { ++ _spin_lock(lock); ++ lock->recurse_cpu = cpu; ++ } ++ ++ /* We support only fairly shallow recursion, else the counter overflows. */ ++ ASSERT(lock->recurse_cnt < 0xfu); ++ lock->recurse_cnt++; + } + + void _spin_unlock_recursive(spinlock_t *lock) +--- sle12sp1.orig/xen/include/asm-arm/system.h 2015-07-08 12:35:16.000000000 +0200 ++++ sle12sp1/xen/include/asm-arm/system.h 2015-07-08 12:37:59.000000000 +0200 +@@ -53,6 +53,9 @@ + + #define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v) + ++#define arch_lock_acquire_barrier() smp_mb() ++#define arch_lock_release_barrier() smp_mb() ++ + extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next); + + #endif +--- sle12sp1.orig/xen/include/asm-x86/system.h 2015-07-08 12:35:11.000000000 +0200 ++++ sle12sp1/xen/include/asm-x86/system.h 2015-07-08 12:37:59.000000000 +0200 +@@ -185,6 +185,17 @@ static always_inline unsigned long __xad + #define set_mb(var, value) do { xchg(&var, value); } while (0) + #define set_wmb(var, value) do { var = value; wmb(); } while (0) + ++/* ++ * On x86 the only reordering is of reads with older writes. In the ++ * lock case, the read in observe_head() can only be reordered with ++ * writes that precede it, and moving a write _into_ a locked section ++ * is OK. In the release case, the write in add_sized() can only be ++ * reordered with reads that follow it, and hoisting a read _into_ a ++ * locked region is OK. 
++ */ ++#define arch_lock_acquire_barrier() barrier() ++#define arch_lock_release_barrier() barrier() ++ + #define local_irq_disable() asm volatile ( "cli" : : : "memory" ) + #define local_irq_enable() asm volatile ( "sti" : : : "memory" ) + +--- sle12sp1.orig/xen/include/xen/spinlock.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/xen/spinlock.h 2015-07-08 12:37:59.000000000 +0200 +@@ -80,8 +80,7 @@ struct lock_profile_qhead { + static struct lock_profile *__lock_profile_##name \ + __used_section(".lockprofile.data") = \ + &__lock_profile_data_##name +-#define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, \ +- _LOCK_DEBUG, x } ++#define _SPIN_LOCK_UNLOCKED(x) { { 0 }, 0xfffu, 0, _LOCK_DEBUG, x } + #define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL) + #define DEFINE_SPINLOCK(l) \ + spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL); \ +@@ -117,8 +116,7 @@ extern void spinlock_profile_reset(unsig + + struct lock_profile_qhead { }; + +-#define SPIN_LOCK_UNLOCKED \ +- { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG } ++#define SPIN_LOCK_UNLOCKED { { 0 }, 0xfffu, 0, _LOCK_DEBUG } + #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED + + #define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l)) +@@ -127,8 +125,18 @@ struct lock_profile_qhead { }; + + #endif + ++typedef union { ++ u32 head_tail; ++ struct { ++ u16 head; ++ u16 tail; ++ }; ++} spinlock_tickets_t; ++ ++#define SPINLOCK_TICKET_INC { .head_tail = 0x10000, } ++ + typedef struct spinlock { +- raw_spinlock_t raw; ++ spinlock_tickets_t tickets; + u16 recurse_cpu:12; + u16 recurse_cnt:4; + struct lock_debug debug; diff --git a/5555a5b9-x86-arm-remove-asm-spinlock-h.patch b/5555a5b9-x86-arm-remove-asm-spinlock-h.patch new file mode 100644 index 0000000..18ab9ac --- /dev/null +++ b/5555a5b9-x86-arm-remove-asm-spinlock-h.patch @@ -0,0 +1,266 @@ +# Commit e62e49e6d5d4e8d22f3df0b75443ede65a812435 +# Date 2015-05-15 09:52:25 +0200 +# Author David Vrabel +# Committer Jan Beulich +x86,arm: remove asm/spinlock.h from all architectures + +Now that all architecture use a common ticket lock implementation for +spinlocks, remove the architecture specific byte lock implementations. + +Signed-off-by: David Vrabel +Reviewed-by: Tim Deegan +Acked-by: Jan Beulich +Acked-by: Ian Campbell + +--- sle12sp1.orig/xen/arch/arm/README.LinuxPrimitives 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/arch/arm/README.LinuxPrimitives 2015-07-08 12:41:16.000000000 +0200 +@@ -25,16 +25,6 @@ linux/arch/arm64/include/asm/atomic.h + + --------------------------------------------------------------------- + +-spinlocks: last sync @ v3.16-rc6 (last commit: 95c4189689f9) +- +-linux/arch/arm64/include/asm/spinlock.h xen/include/asm-arm/arm64/spinlock.h +- +-Skipped: +- 5686b06 arm64: lockref: add support for lockless lockrefs using cmpxchg +- 52ea2a5 arm64: locks: introduce ticket-based spinlock implementation +- +---------------------------------------------------------------------- +- + mem*: last sync @ v3.16-rc6 (last commit: d875c9b37240) + + linux/arch/arm64/lib/memchr.S xen/arch/arm/arm64/lib/memchr.S +@@ -103,24 +93,6 @@ linux/arch/arm/include/asm/atomic.h + + --------------------------------------------------------------------- + +-spinlocks: last sync: 15e7e5c1ebf5 +- +-linux/arch/arm/include/asm/spinlock.h xen/include/asm-arm/arm32/spinlock.h +- +-*** Linux has switched to ticket locks but we still use bitlocks. 
+- +-resync to v3.14-rc7: +- +- 7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev +- 0cbad9c ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64 +- 9bb17be ARM: locks: prefetch the destination word for write prior to strex +- 27a8479 ARM: smp_on_up: move inline asm ALT_SMP patching macro out of spinlock. +- 00efaa0 ARM: 7812/1: rwlocks: retry trylock operation if strex fails on free lo +- afa31d8 ARM: 7811/1: locks: use early clobber in arch_spin_trylock +- 73a6fdc ARM: spinlock: use inner-shareable dsb variant prior to sev instruction +- +---------------------------------------------------------------------- +- + mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0) + + linux/arch/arm/lib/copy_template.S xen/arch/arm/arm32/lib/copy_template.S +--- sle12sp1.orig/xen/include/asm-arm/arm32/spinlock.h 2015-01-14 18:44:18.000000000 +0100 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 +@@ -1,66 +0,0 @@ +-#ifndef __ASM_ARM32_SPINLOCK_H +-#define __ASM_ARM32_SPINLOCK_H +- +-static inline void dsb_sev(void) +-{ +- __asm__ __volatile__ ( +- "dsb\n" +- "sev\n" +- ); +-} +- +-typedef struct { +- volatile unsigned int lock; +-} raw_spinlock_t; +- +-#define _RAW_SPIN_LOCK_UNLOCKED { 0 } +- +-#define _raw_spin_is_locked(x) ((x)->lock != 0) +- +-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) +-{ +- ASSERT(_raw_spin_is_locked(lock)); +- +- smp_mb(); +- +- __asm__ __volatile__( +-" str %1, [%0]\n" +- : +- : "r" (&lock->lock), "r" (0) +- : "cc"); +- +- dsb_sev(); +-} +- +-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) +-{ +- unsigned long contended, res; +- +- do { +- __asm__ __volatile__( +- " ldrex %0, [%2]\n" +- " teq %0, #0\n" +- " strexeq %1, %3, [%2]\n" +- " movne %1, #0\n" +- : "=&r" (contended), "=r" (res) +- : "r" (&lock->lock), "r" (1) +- : "cc"); +- } while (res); +- +- if (!contended) { +- smp_mb(); +- return 1; +- } else { +- return 0; +- } +-} +- +-#endif /* __ASM_SPINLOCK_H */ +-/* +- * Local variables: +- * mode: C +- * c-file-style: "BSD" +- * c-basic-offset: 4 +- * indent-tabs-mode: nil +- * End: +- */ +--- sle12sp1.orig/xen/include/asm-arm/arm64/spinlock.h 2015-01-14 18:44:18.000000000 +0100 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 +@@ -1,63 +0,0 @@ +-/* +- * Derived from Linux arch64 spinlock.h which is: +- * Copyright (C) 2012 ARM Ltd. +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . 
+- */ +- +-#ifndef __ASM_ARM64_SPINLOCK_H +-#define __ASM_ARM64_SPINLOCK_H +- +-typedef struct { +- volatile unsigned int lock; +-} raw_spinlock_t; +- +-#define _RAW_SPIN_LOCK_UNLOCKED { 0 } +- +-#define _raw_spin_is_locked(x) ((x)->lock != 0) +- +-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) +-{ +- ASSERT(_raw_spin_is_locked(lock)); +- +- asm volatile( +- " stlr %w1, %0\n" +- : "=Q" (lock->lock) : "r" (0) : "memory"); +-} +- +-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) +-{ +- unsigned int tmp; +- +- asm volatile( +- "2: ldaxr %w0, %1\n" +- " cbnz %w0, 1f\n" +- " stxr %w0, %w2, %1\n" +- " cbnz %w0, 2b\n" +- "1:\n" +- : "=&r" (tmp), "+Q" (lock->lock) +- : "r" (1) +- : "cc", "memory"); +- +- return !tmp; +-} +- +-#endif /* __ASM_SPINLOCK_H */ +-/* +- * Local variables: +- * mode: C +- * c-file-style: "BSD" +- * c-basic-offset: 4 +- * indent-tabs-mode: nil +- * End: +- */ +--- sle12sp1.orig/xen/include/asm-arm/spinlock.h 2013-07-09 20:57:12.000000000 +0200 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 +@@ -1,23 +0,0 @@ +-#ifndef __ASM_SPINLOCK_H +-#define __ASM_SPINLOCK_H +- +-#include +-#include +- +-#if defined(CONFIG_ARM_32) +-# include +-#elif defined(CONFIG_ARM_64) +-# include +-#else +-# error "unknown ARM variant" +-#endif +- +-#endif /* __ASM_SPINLOCK_H */ +-/* +- * Local variables: +- * mode: C +- * c-file-style: "BSD" +- * c-basic-offset: 4 +- * indent-tabs-mode: nil +- * End: +- */ +--- sle12sp1.orig/xen/include/asm-x86/spinlock.h 2015-01-14 18:44:18.000000000 +0100 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 +@@ -1,34 +0,0 @@ +-#ifndef __ASM_SPINLOCK_H +-#define __ASM_SPINLOCK_H +- +-#include +-#include +-#include +- +-typedef struct { +- volatile s16 lock; +-} raw_spinlock_t; +- +-#define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 } +- +-#define _raw_spin_is_locked(x) ((x)->lock <= 0) +- +-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) +-{ +- ASSERT(_raw_spin_is_locked(lock)); +- asm volatile ( +- "movw $1,%0" +- : "=m" (lock->lock) : : "memory" ); +-} +- +-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) +-{ +- s16 oldval; +- asm volatile ( +- "xchgw %w0,%1" +- :"=r" (oldval), "=m" (lock->lock) +- :"0" ((s16)0) : "memory" ); +- return (oldval > 0); +-} +- +-#endif /* __ASM_SPINLOCK_H */ +--- sle12sp1.orig/xen/include/xen/spinlock.h 2015-07-08 12:37:59.000000000 +0200 ++++ sle12sp1/xen/include/xen/spinlock.h 2015-07-08 12:41:16.000000000 +0200 +@@ -2,7 +2,6 @@ + #define __SPINLOCK_H__ + + #include +-#include + + #ifndef NDEBUG + struct lock_debug { diff --git a/5555a8ec-introduce-non-contiguous-allocation.patch b/5555a8ec-introduce-non-contiguous-allocation.patch new file mode 100644 index 0000000..68c0ff6 --- /dev/null +++ b/5555a8ec-introduce-non-contiguous-allocation.patch @@ -0,0 +1,141 @@ +# Commit f278fcf19ce15f7b7ee69181560b5884a5e12b66 +# Date 2015-05-15 10:06:04 +0200 +# Author Roger Pau Monné +# Committer Jan Beulich +introduce a helper to allocate non-contiguous memory + +The allocator uses independent calls to alloc_domheap_pages in order to get +the desired amount of memory and then maps all the independent physical +addresses into a contiguous virtual address space. 
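To make the interface concrete, a hypothetical caller, not taken from this series, could use the new functions like this; vmalloc()/vzalloc() hand back a virtually contiguous mapping backed by independently allocated domheap pages, and vfree() unmaps and frees them again (and, per the vfree(NULL) follow-up folded in below, tolerates a NULL argument):

    /* Illustrative sketch against the interface added by this patch; the
     * struct and function names here are invented for the example. */
    struct trace_buf {
        size_t  size;
        uint8_t *data;
    };

    static int trace_buf_init(struct trace_buf *tb, size_t size)
    {
        tb->data = vzalloc(size);      /* zeroed, virtually contiguous */
        if ( tb->data == NULL )
            return -ENOMEM;
        tb->size = size;
        return 0;
    }

    static void trace_buf_free(struct trace_buf *tb)
    {
        vfree(tb->data);               /* NULL-safe per the vfree(NULL) fix */
        tb->data = NULL;
        tb->size = 0;
    }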
+ +Signed-off-by: Roger Pau Monné +Tested-by: Julien Grall (ARM) +Reviewed-by: Tim Deegan + +# Commit 640f891eb258563bb155e577389e8c5e6541a59a +# Date 2015-05-21 08:57:19 +0200 +# Author Andrew Cooper +# Committer Jan Beulich +vmap: avoid hitting an ASSERT with vfree(NULL) + +and unconditionally defer the vm_size() call, as it doesn't have a NULL +short circuit. + +Reported-by: Wei Liu +Signed-off-by: Andrew Cooper +Tested-by: Wei Liu +Reviewed-by: Roger Pau Monné +Acked-by: Tim Deegan + +--- sle12sp1.orig/xen/common/vmap.c 2013-10-31 22:33:32.000000000 +0100 ++++ sle12sp1/xen/common/vmap.c 2015-07-08 14:18:50.000000000 +0200 +@@ -215,4 +215,75 @@ void vunmap(const void *va) + #endif + vm_free(va); + } ++ ++void *vmalloc(size_t size) ++{ ++ unsigned long *mfn; ++ size_t pages, i; ++ struct page_info *pg; ++ void *va; ++ ++ ASSERT(size); ++ ++ pages = PFN_UP(size); ++ mfn = xmalloc_array(unsigned long, pages); ++ if ( mfn == NULL ) ++ return NULL; ++ ++ for ( i = 0; i < pages; i++ ) ++ { ++ pg = alloc_domheap_page(NULL, 0); ++ if ( pg == NULL ) ++ goto error; ++ mfn[i] = page_to_mfn(pg); ++ } ++ ++ va = vmap(mfn, pages); ++ if ( va == NULL ) ++ goto error; ++ ++ xfree(mfn); ++ return va; ++ ++ error: ++ while ( i-- ) ++ free_domheap_page(mfn_to_page(mfn[i])); ++ xfree(mfn); ++ return NULL; ++} ++ ++void *vzalloc(size_t size) ++{ ++ void *p = vmalloc(size); ++ int i; ++ ++ if ( p == NULL ) ++ return NULL; ++ ++ for ( i = 0; i < size; i += PAGE_SIZE ) ++ clear_page(p + i); ++ ++ return p; ++} ++ ++void vfree(void *va) ++{ ++ unsigned int i, pages; ++ struct page_info *pg; ++ PAGE_LIST_HEAD(pg_list); ++ ++ if ( !va ) ++ return; ++ ++ pages = vm_size(va); ++ ASSERT(pages); ++ ++ for ( i = 0; i < pages; i++ ) ++ page_list_add(vmap_to_page(va + i * PAGE_SIZE), &pg_list); ++ ++ vunmap(va); ++ ++ while ( (pg = page_list_remove_head(&pg_list)) != NULL ) ++ free_domheap_page(pg); ++} + #endif +--- sle12sp1.orig/xen/include/asm-arm/mm.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-arm/mm.h 2015-07-08 14:18:50.000000000 +0200 +@@ -208,6 +208,8 @@ static inline void __iomem *ioremap_wc(p + #define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT) + #define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT)) + #define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa)) ++#define vmap_to_mfn(va) paddr_to_pfn(virt_to_maddr((vaddr_t)va)) ++#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) + + /* Page-align address and convert to frame number format */ + #define paddr_to_pfn_aligned(paddr) paddr_to_pfn(PAGE_ALIGN(paddr)) +--- sle12sp1.orig/xen/include/asm-x86/page.h 2015-06-03 16:55:05.000000000 +0200 ++++ sle12sp1/xen/include/asm-x86/page.h 2015-07-08 14:18:50.000000000 +0200 +@@ -262,6 +262,8 @@ void copy_page_sse2(void *, const void * + #define pfn_to_paddr(pfn) __pfn_to_paddr(pfn) + #define paddr_to_pfn(pa) __paddr_to_pfn(pa) + #define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa)) ++#define vmap_to_mfn(va) l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))) ++#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) + + #endif /* !defined(__ASSEMBLY__) */ + +--- sle12sp1.orig/xen/include/xen/vmap.h 2013-07-09 20:57:12.000000000 +0200 ++++ sle12sp1/xen/include/xen/vmap.h 2015-07-08 14:18:50.000000000 +0200 +@@ -11,6 +11,9 @@ void *__vmap(const unsigned long *mfn, u + unsigned int nr, unsigned int align, unsigned int flags); + void *vmap(const unsigned long *mfn, unsigned int nr); + void vunmap(const void *); ++void *vmalloc(size_t size); ++void *vzalloc(size_t size); ++void vfree(void 
*va); + + void __iomem *ioremap(paddr_t, size_t); + diff --git a/55795a52-x86-vMSI-X-support-qword-MMIO-access.patch b/55795a52-x86-vMSI-X-support-qword-MMIO-access.patch new file mode 100644 index 0000000..c5b0920 --- /dev/null +++ b/55795a52-x86-vMSI-X-support-qword-MMIO-access.patch @@ -0,0 +1,97 @@ +# Commit 284ffb4f9b0d5c3a33c4c5bd87645d0cc342ca96 +# Date 2015-06-11 11:52:18 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/vMSI-X: support qword MMIO access + +The specification explicitly provides for this, so we should have +supported this from the beginning. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/hvm/vmsi.c ++++ b/xen/arch/x86/hvm/vmsi.c +@@ -223,7 +223,7 @@ static int msixtbl_read( + unsigned int nr_entry, index; + int r = X86EMUL_UNHANDLEABLE; + +- if ( len != 4 || (address & 3) ) ++ if ( (len != 4 && len != 8) || (address & (len - 1)) ) + return r; + + rcu_read_lock(&msixtbl_rcu_lock); +@@ -241,13 +241,25 @@ static int msixtbl_read( + !acc_bit(test, entry, nr_entry, index) ) + goto out; + *pval = entry->gentries[nr_entry].msi_ad[index]; ++ if ( len == 8 ) ++ { ++ if ( index ) ++ offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; ++ else if ( acc_bit(test, entry, nr_entry, 1) ) ++ *pval |= (u64)entry->gentries[nr_entry].msi_ad[1] << 32; ++ else ++ goto out; ++ } + } +- else ++ if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) + { + virt = msixtbl_addr_to_virt(entry, address); + if ( !virt ) + goto out; +- *pval = readl(virt); ++ if ( len == 4 ) ++ *pval = readl(virt); ++ else ++ *pval |= (u64)readl(virt) << 32; + } + + r = X86EMUL_OKAY; +@@ -268,7 +280,7 @@ static int msixtbl_write(struct vcpu *v, unsigned long address, + unsigned long flags, orig; + struct irq_desc *desc; + +- if ( len != 4 || (address & 3) ) ++ if ( (len != 4 && len != 8) || (address & (len - 1)) ) + return r; + + rcu_read_lock(&msixtbl_rcu_lock); +@@ -279,16 +291,23 @@ static int msixtbl_write(struct vcpu *v, unsigned long address, + nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; + + offset = address & (PCI_MSIX_ENTRY_SIZE - 1); +- if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) ++ if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) + { ++ index = offset / sizeof(uint32_t); + if ( nr_entry < MAX_MSIX_ACC_ENTRIES ) + { +- index = offset / sizeof(uint32_t); + entry->gentries[nr_entry].msi_ad[index] = val; + acc_bit(set, entry, nr_entry, index); ++ if ( len == 8 && !index ) ++ { ++ entry->gentries[nr_entry].msi_ad[1] = val >> 32; ++ acc_bit(set, entry, nr_entry, 1); ++ } + } + set_bit(nr_entry, &entry->table_flags); +- goto out; ++ if ( len != 8 || !index ) ++ goto out; ++ val >>= 32; + } + + /* Exit to device model when unmasking and address/data got modified. */ +@@ -352,7 +371,8 @@ static int msixtbl_write(struct vcpu *v, unsigned long address, + + unlock: + spin_unlock_irqrestore(&desc->lock, flags); +- r = X86EMUL_OKAY; ++ if ( len == 4 ) ++ r = X86EMUL_OKAY; + + out: + rcu_read_unlock(&msixtbl_rcu_lock); diff --git a/557eb55f-gnttab-per-active-entry-locking.patch b/557eb55f-gnttab-per-active-entry-locking.patch new file mode 100644 index 0000000..019f522 --- /dev/null +++ b/557eb55f-gnttab-per-active-entry-locking.patch @@ -0,0 +1,551 @@ +# Commit b4650e9a96d78b87ccf7deb4f74733ccfcc64db5 +# Date 2015-06-15 13:22:07 +0200 +# Author David Vrabel +# Committer Jan Beulich +gnttab: per-active entry locking + +Introduce a per-active entry spin lock to protect active entry state +The grant table lock must be locked before acquiring (locking) an +active entry. 
+ +This is a step in reducing contention on the grant table lock, but +will only do so once the grant table lock is turned into a read-write +lock. + +Based on a patch originally by Matt Wilson . + +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/docs/misc/grant-tables.txt 2008-10-14 19:44:06.000000000 +0200 ++++ sle12sp1/docs/misc/grant-tables.txt 2015-07-08 13:49:42.000000000 +0200 +@@ -63,6 +63,7 @@ is complete. + act->domid : remote domain being granted rights + act->frame : machine frame being granted + act->pin : used to hold reference counts ++ act->lock : spinlock used to serialize access to active entry state + + Map tracking + ~~~~~~~~~~~~ +@@ -74,7 +75,46 @@ is complete. + matching map track entry is then removed, as if unmap had been invoked. + These are not used by the transfer mechanism. + map->domid : owner of the mapped frame +- map->ref_and_flags : grant reference, ro/rw, mapped for host or device access ++ map->ref : grant reference ++ map->flags : ro/rw, mapped for host or device access ++ ++******************************************************************************** ++ Locking ++ ~~~~~~~ ++ Xen uses several locks to serialize access to the internal grant table state. ++ ++ grant_table->lock : lock used to prevent readers from accessing ++ inconsistent grant table state such as current ++ version, partially initialized active table pages, ++ etc. ++ active_grant_entry->lock : spinlock used to serialize modifications to ++ active entries ++ ++ The primary lock for the grant table is a spinlock. All functions ++ that access members of struct grant_table must acquire the lock ++ around critical sections. ++ ++ Active entries are obtained by calling active_entry_acquire(gt, ref). ++ This function returns a pointer to the active entry after locking its ++ spinlock. The caller must hold the grant table lock for the gt in ++ question before calling active_entry_acquire(). This is because the ++ grant table can be dynamically extended via gnttab_grow_table() while ++ a domain is running and must be fully initialized. Once all access to ++ the active entry is complete, release the lock by calling ++ active_entry_release(act). ++ ++ Summary of rules for locking: ++ active_entry_acquire() and active_entry_release() can only be ++ called when holding the relevant grant table's lock. I.e.: ++ spin_lock(>->lock); ++ act = active_entry_acquire(gt, ref); ++ ... ++ active_entry_release(act); ++ spin_unlock(>->lock); ++ ++ Active entries cannot be acquired while holding the maptrack lock. ++ Multiple active entries can be acquired while holding the grant table ++ lock. + + ******************************************************************************** + +--- sle12sp1.orig/xen/common/grant_table.c 2015-06-26 15:38:17.000000000 +0200 ++++ sle12sp1/xen/common/grant_table.c 2015-07-08 13:49:42.000000000 +0200 +@@ -157,10 +157,13 @@ struct active_grant_entry { + in the page. */ + unsigned length:16; /* For sub-page grants, the length of the + grant. */ ++ spinlock_t lock; /* lock to protect access of this entry. 
++ see docs/misc/grant-tables.txt for ++ locking protocol */ + }; + + #define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry)) +-#define active_entry(t, e) \ ++#define _active_entry(t, e) \ + ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE]) + + static inline void gnttab_flush_tlb(const struct domain *d) +@@ -188,6 +191,24 @@ nr_active_grant_frames(struct grant_tabl + return num_act_frames_from_sha_frames(nr_grant_frames(gt)); + } + ++static inline struct active_grant_entry * ++active_entry_acquire(struct grant_table *t, grant_ref_t e) ++{ ++ struct active_grant_entry *act; ++ ++ ASSERT(spin_is_locked(&t->lock)); ++ ++ act = &_active_entry(t, e); ++ spin_lock(&act->lock); ++ ++ return act; ++} ++ ++static inline void active_entry_release(struct active_grant_entry *act) ++{ ++ spin_unlock(&act->lock); ++} ++ + /* Check if the page has been paged out, or needs unsharing. + If rc == GNTST_okay, *page contains the page struct with a ref taken. + Caller must do put_page(*page). +@@ -505,7 +526,6 @@ static int grant_map_exists(const struct + unsigned long mfn, + unsigned int *ref_count) + { +- const struct active_grant_entry *act; + unsigned int ref, max_iter; + + ASSERT(spin_is_locked(&rgt->lock)); +@@ -514,18 +534,19 @@ static int grant_map_exists(const struct + nr_grant_entries(rgt)); + for ( ref = *ref_count; ref < max_iter; ref++ ) + { +- act = &active_entry(rgt, ref); ++ struct active_grant_entry *act; ++ bool_t exists; + +- if ( !act->pin ) +- continue; ++ act = active_entry_acquire(rgt, ref); + +- if ( act->domid != ld->domain_id ) +- continue; ++ exists = act->pin ++ && act->domid == ld->domain_id ++ && act->frame == mfn; + +- if ( act->frame != mfn ) +- continue; ++ active_entry_release(act); + +- return 0; ++ if ( exists ) ++ return 0; + } + + if ( ref < nr_grant_entries(rgt) ) +@@ -546,13 +567,24 @@ static void mapcount( + + *wrc = *rdc = 0; + ++ /* ++ * Must have the local domain's grant table lock when iterating ++ * over its maptrack entries. ++ */ ++ ASSERT(spin_is_locked(&lgt->lock)); ++ /* ++ * Must have the remote domain's grant table lock while counting ++ * its active entries. ++ */ ++ ASSERT(spin_is_locked(&rd->grant_table->lock)); ++ + for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) + { + map = &maptrack_entry(lgt, handle); + if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || + map->domid != rd->domain_id ) + continue; +- if ( active_entry(rd->grant_table, map->ref).frame == mfn ) ++ if ( _active_entry(rd->grant_table, map->ref).frame == mfn ) + (map->flags & GNTMAP_readonly) ? 
(*rdc)++ : (*wrc)++; + } + } +@@ -639,7 +671,7 @@ __gnttab_map_grant_ref( + if ( unlikely(op->ref >= nr_grant_entries(rgt))) + PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref); + +- act = &active_entry(rgt, op->ref); ++ act = active_entry_acquire(rgt, op->ref); + shah = shared_entry_header(rgt, op->ref); + if (rgt->gt_version == 1) { + sha1 = &shared_entry_v1(rgt, op->ref); +@@ -656,7 +688,7 @@ __gnttab_map_grant_ref( + ((act->domid != ld->domain_id) || + (act->pin & 0x80808080U) != 0 || + (act->is_sub_page)) ) +- PIN_FAIL(unlock_out, GNTST_general_error, ++ PIN_FAIL(act_release_out, GNTST_general_error, + "Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n", + act->domid, ld->domain_id, act->pin, act->is_sub_page); + +@@ -667,7 +699,7 @@ __gnttab_map_grant_ref( + if ( (rc = _set_status(rgt->gt_version, ld->domain_id, + op->flags & GNTMAP_readonly, + 1, shah, act, status) ) != GNTST_okay ) +- goto unlock_out; ++ goto act_release_out; + + if ( !act->pin ) + { +@@ -702,6 +734,7 @@ __gnttab_map_grant_ref( + + cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) ); + ++ active_entry_release(act); + spin_unlock(&rgt->lock); + + /* pg may be set, with a refcount included, from __get_paged_frame */ +@@ -839,7 +872,7 @@ __gnttab_map_grant_ref( + + spin_lock(&rgt->lock); + +- act = &active_entry(rgt, op->ref); ++ act = active_entry_acquire(rgt, op->ref); + + if ( op->flags & GNTMAP_device_map ) + act->pin -= (op->flags & GNTMAP_readonly) ? +@@ -856,6 +889,9 @@ __gnttab_map_grant_ref( + if ( !act->pin ) + gnttab_clear_flag(_GTF_reading, status); + ++ act_release_out: ++ active_entry_release(act); ++ + unlock_out: + spin_unlock(&rgt->lock); + op->status = rc; +@@ -950,7 +986,7 @@ __gnttab_unmap_common( + } + + op->rd = rd; +- act = &active_entry(rgt, op->map->ref); ++ act = active_entry_acquire(rgt, op->map->ref); + + if ( op->frame == 0 ) + { +@@ -959,7 +995,7 @@ __gnttab_unmap_common( + else + { + if ( unlikely(op->frame != act->frame) ) +- PIN_FAIL(unmap_out, GNTST_general_error, ++ PIN_FAIL(act_release_out, GNTST_general_error, + "Bad frame number doesn't match gntref. 
(%lx != %lx)\n", + op->frame, act->frame); + if ( op->flags & GNTMAP_device_map ) +@@ -978,7 +1014,7 @@ __gnttab_unmap_common( + if ( (rc = replace_grant_host_mapping(op->host_addr, + op->frame, op->new_addr, + op->flags)) < 0 ) +- goto unmap_out; ++ goto act_release_out; + + ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask)); + op->map->flags &= ~GNTMAP_host_map; +@@ -1000,7 +1036,7 @@ __gnttab_unmap_common( + if ( err ) + { + rc = GNTST_general_error; +- goto unmap_out; ++ goto act_release_out; + } + } + +@@ -1008,8 +1044,11 @@ __gnttab_unmap_common( + if ( !(op->flags & GNTMAP_readonly) ) + gnttab_mark_dirty(rd, op->frame); + ++ act_release_out: ++ active_entry_release(act); + unmap_out: + double_gt_unlock(lgt, rgt); ++ + op->status = rc; + rcu_unlock_domain(rd); + } +@@ -1042,9 +1081,9 @@ __gnttab_unmap_common_complete(struct gn + spin_lock(&rgt->lock); + + if ( rgt->gt_version == 0 ) +- goto unmap_out; ++ goto unlock_out; + +- act = &active_entry(rgt, op->map->ref); ++ act = active_entry_acquire(rgt, op->map->ref); + sha = shared_entry_header(rgt, op->map->ref); + + if ( rgt->gt_version == 1 ) +@@ -1058,7 +1097,7 @@ __gnttab_unmap_common_complete(struct gn + * Suggests that __gntab_unmap_common failed early and so + * nothing further to do + */ +- goto unmap_out; ++ goto act_release_out; + } + + pg = mfn_to_page(op->frame); +@@ -1082,7 +1121,7 @@ __gnttab_unmap_common_complete(struct gn + * Suggests that __gntab_unmap_common failed in + * replace_grant_host_mapping() so nothing further to do + */ +- goto unmap_out; ++ goto act_release_out; + } + + if ( !is_iomem_page(op->frame) ) +@@ -1103,8 +1142,11 @@ __gnttab_unmap_common_complete(struct gn + if ( act->pin == 0 ) + gnttab_clear_flag(_GTF_reading, status); + +- unmap_out: ++ act_release_out: ++ active_entry_release(act); ++ unlock_out: + spin_unlock(&rgt->lock); ++ + if ( put_handle ) + { + op->map->flags = 0; +@@ -1296,7 +1338,7 @@ gnttab_grow_table(struct domain *d, unsi + /* d's grant table lock must be held by the caller */ + + struct grant_table *gt = d->grant_table; +- unsigned int i; ++ unsigned int i, j; + + ASSERT(req_nr_frames <= max_grant_frames); + +@@ -1311,6 +1353,8 @@ gnttab_grow_table(struct domain *d, unsi + if ( (gt->active[i] = alloc_xenheap_page()) == NULL ) + goto active_alloc_failed; + clear_page(gt->active[i]); ++ for ( j = 0; j < ACGNT_PER_PAGE; j++ ) ++ spin_lock_init(>->active[i][j].lock); + } + + /* Shared */ +@@ -1805,7 +1849,7 @@ __release_grant_for_copy( + + spin_lock(&rgt->lock); + +- act = &active_entry(rgt, gref); ++ act = active_entry_acquire(rgt, gref); + sha = shared_entry_header(rgt, gref); + r_frame = act->frame; + +@@ -1844,6 +1888,7 @@ __release_grant_for_copy( + released_read = 1; + } + ++ active_entry_release(act); + spin_unlock(&rgt->lock); + + if ( td != rd ) +@@ -1905,14 +1950,14 @@ __acquire_grant_for_copy( + spin_lock(&rgt->lock); + + if ( rgt->gt_version == 0 ) +- PIN_FAIL(unlock_out, GNTST_general_error, ++ PIN_FAIL(gt_unlock_out, GNTST_general_error, + "remote grant table not ready\n"); + + if ( unlikely(gref >= nr_grant_entries(rgt)) ) +- PIN_FAIL(unlock_out, GNTST_bad_gntref, ++ PIN_FAIL(gt_unlock_out, GNTST_bad_gntref, + "Bad grant reference %ld\n", gref); + +- act = &active_entry(rgt, gref); ++ act = active_entry_acquire(rgt, gref); + shah = shared_entry_header(rgt, gref); + if ( rgt->gt_version == 1 ) + { +@@ -1971,6 +2016,13 @@ __acquire_grant_for_copy( + PIN_FAIL(unlock_out_clear, GNTST_general_error, + "transitive grant referenced bad domain %d\n", + trans_domid); ++ ++ 
/* ++ * __acquire_grant_for_copy() could take the lock on the ++ * remote table (if rd == td), so we have to drop the lock ++ * here and reacquire ++ */ ++ active_entry_release(act); + spin_unlock(&rgt->lock); + + rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, +@@ -1978,9 +2030,12 @@ __acquire_grant_for_copy( + &trans_page_off, &trans_length, 0); + + spin_lock(&rgt->lock); ++ act = active_entry_acquire(rgt, gref); ++ + if ( rc != GNTST_okay ) { + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); ++ active_entry_release(act); + spin_unlock(&rgt->lock); + return rc; + } +@@ -1993,6 +2048,7 @@ __acquire_grant_for_copy( + { + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); ++ active_entry_release(act); + spin_unlock(&rgt->lock); + put_page(*page); + return __acquire_grant_for_copy(rd, gref, ldom, readonly, +@@ -2061,6 +2117,7 @@ __acquire_grant_for_copy( + *length = act->length; + *frame = act->frame; + ++ active_entry_release(act); + spin_unlock(&rgt->lock); + return rc; + +@@ -2073,7 +2130,11 @@ __acquire_grant_for_copy( + gnttab_clear_flag(_GTF_reading, status); + + unlock_out: ++ active_entry_release(act); ++ ++ gt_unlock_out: + spin_unlock(&rgt->lock); ++ + return rc; + } + +@@ -2231,7 +2292,6 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA + gnttab_set_version_t op; + struct domain *d = current->domain; + struct grant_table *gt = d->grant_table; +- struct active_grant_entry *act; + grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES]; + long res; + int i; +@@ -2256,8 +2316,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA + { + for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ ) + { +- act = &active_entry(gt, i); +- if ( act->pin != 0 ) ++ if ( read_atomic(&_active_entry(gt, i).pin) != 0 ) + { + gdprintk(XENLOG_WARNING, + "tried to change grant table version from %d to %d, but some grant entries still in use\n", +@@ -2444,7 +2503,8 @@ __gnttab_swap_grant_ref(grant_ref_t ref_ + { + struct domain *d = rcu_lock_current_domain(); + struct grant_table *gt = d->grant_table; +- struct active_grant_entry *act; ++ struct active_grant_entry *act_a = NULL; ++ struct active_grant_entry *act_b = NULL; + s16 rc = GNTST_okay; + + spin_lock(>->lock); +@@ -2458,12 +2518,16 @@ __gnttab_swap_grant_ref(grant_ref_t ref_ + if ( unlikely(ref_b >= nr_grant_entries(d->grant_table))) + PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b); + +- act = &active_entry(gt, ref_a); +- if ( act->pin ) ++ /* Swapping the same ref is a no-op. 
*/ ++ if ( ref_a == ref_b ) ++ goto out; ++ ++ act_a = active_entry_acquire(gt, ref_a); ++ if ( act_a->pin ) + PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a); + +- act = &active_entry(gt, ref_b); +- if ( act->pin ) ++ act_b = active_entry_acquire(gt, ref_b); ++ if ( act_b->pin ) + PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b); + + if ( gt->gt_version == 1 ) +@@ -2490,6 +2554,10 @@ __gnttab_swap_grant_ref(grant_ref_t ref_ + } + + out: ++ if ( act_b != NULL ) ++ active_entry_release(act_b); ++ if ( act_a != NULL ) ++ active_entry_release(act_a); + spin_unlock(>->lock); + + rcu_unlock_domain(d); +@@ -2799,7 +2867,7 @@ grant_table_create( + struct domain *d) + { + struct grant_table *t; +- int i; ++ unsigned int i, j; + + if ( (t = xzalloc(struct grant_table)) == NULL ) + goto no_mem_0; +@@ -2818,6 +2886,8 @@ grant_table_create( + if ( (t->active[i] = alloc_xenheap_page()) == NULL ) + goto no_mem_2; + clear_page(t->active[i]); ++ for ( j = 0; j < ACGNT_PER_PAGE; j++ ) ++ spin_lock_init(&t->active[i][j].lock); + } + + /* Tracking of mapped foreign frames table */ +@@ -2914,7 +2984,7 @@ gnttab_release_mappings( + rgt = rd->grant_table; + spin_lock(&rgt->lock); + +- act = &active_entry(rgt, ref); ++ act = active_entry_acquire(rgt, ref); + sha = shared_entry_header(rgt, ref); + if (rgt->gt_version == 1) + status = &sha->flags; +@@ -2972,6 +3042,7 @@ gnttab_release_mappings( + if ( act->pin == 0 ) + gnttab_clear_flag(_GTF_reading, status); + ++ active_entry_release(act); + spin_unlock(&rgt->lock); + + rcu_unlock_domain(rd); +@@ -3034,9 +3105,12 @@ static void gnttab_usage_print(struct do + uint16_t status; + uint64_t frame; + +- act = &active_entry(gt, ref); ++ act = active_entry_acquire(gt, ref); + if ( !act->pin ) ++ { ++ active_entry_release(act); + continue; ++ } + + sha = shared_entry_header(gt, ref); + +@@ -3066,6 +3140,7 @@ static void gnttab_usage_print(struct do + printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n", + ref, act->domid, act->frame, act->pin, + sha->domid, frame, status); ++ active_entry_release(act); + } + + out: diff --git a/557eb5b6-gnttab-introduce-maptrack-lock.patch b/557eb5b6-gnttab-introduce-maptrack-lock.patch new file mode 100644 index 0000000..a087af6 --- /dev/null +++ b/557eb5b6-gnttab-introduce-maptrack-lock.patch @@ -0,0 +1,86 @@ +# Commit 5a9899ddc42040e139233a6b1f0f65f3b65eda6d +# Date 2015-06-15 13:23:34 +0200 +# Author David Vrabel +# Committer Jan Beulich +gnttab: introduce maptrack lock + +Split grant table lock into two separate locks. One to protect +maptrack free list (maptrack_lock) and one for everything else (lock). + +Based on a patch originally by Matt Wilson . + +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/docs/misc/grant-tables.txt 2015-07-08 13:49:42.000000000 +0200 ++++ sle12sp1/docs/misc/grant-tables.txt 2015-07-08 13:49:46.000000000 +0200 +@@ -87,6 +87,7 @@ is complete. + inconsistent grant table state such as current + version, partially initialized active table pages, + etc. ++ grant_table->maptrack_lock : spinlock used to protect the maptrack free list + active_grant_entry->lock : spinlock used to serialize modifications to + active entries + +@@ -94,6 +95,9 @@ is complete. + that access members of struct grant_table must acquire the lock + around critical sections. + ++ The maptrack free list is protected by its own spinlock. The maptrack ++ lock may be locked while holding the grant table lock. ++ + Active entries are obtained by calling active_entry_acquire(gt, ref). 
+ This function returns a pointer to the active entry after locking its + spinlock. The caller must hold the grant table lock for the gt in +--- sle12sp1.orig/xen/common/grant_table.c 2015-07-08 13:49:42.000000000 +0200 ++++ sle12sp1/xen/common/grant_table.c 2015-07-08 13:49:46.000000000 +0200 +@@ -288,10 +288,10 @@ static inline void + put_maptrack_handle( + struct grant_table *t, int handle) + { +- spin_lock(&t->lock); ++ spin_lock(&t->maptrack_lock); + maptrack_entry(t, handle).ref = t->maptrack_head; + t->maptrack_head = handle; +- spin_unlock(&t->lock); ++ spin_unlock(&t->maptrack_lock); + } + + static inline int +@@ -303,7 +303,7 @@ get_maptrack_handle( + struct grant_mapping *new_mt; + unsigned int new_mt_limit, nr_frames; + +- spin_lock(&lgt->lock); ++ spin_lock(&lgt->maptrack_lock); + + while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) ) + { +@@ -332,7 +332,7 @@ get_maptrack_handle( + nr_frames + 1); + } + +- spin_unlock(&lgt->lock); ++ spin_unlock(&lgt->maptrack_lock); + + return handle; + } +@@ -2874,6 +2874,7 @@ grant_table_create( + + /* Simple stuff. */ + spin_lock_init(&t->lock); ++ spin_lock_init(&t->maptrack_lock); + t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES; + + /* Active grant table. */ +--- sle12sp1.orig/xen/include/xen/grant_table.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/xen/grant_table.h 2015-07-08 13:49:46.000000000 +0200 +@@ -82,6 +82,8 @@ struct grant_table { + struct grant_mapping **maptrack; + unsigned int maptrack_head; + unsigned int maptrack_limit; ++ /* Lock protecting the maptrack page list, head, and limit */ ++ spinlock_t maptrack_lock; + /* Lock protecting updates to active and shared grant tables. */ + spinlock_t lock; + /* The defined versions are 1 and 2. Set to 0 if we don't know diff --git a/557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch b/557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch new file mode 100644 index 0000000..04c482f --- /dev/null +++ b/557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch @@ -0,0 +1,733 @@ +# Commit 40de9fffb4cc0b0485aa3391d72e2220b8e1ce12 +# Date 2015-06-15 13:25:20 +0200 +# Author David Vrabel +# Committer Jan Beulich +gnttab: make the grant table lock a read-write lock + +In combination with the per-active entry locks, the grant table lock +can be made a read-write lock since the majority of cases only the +read lock is required. The grant table read lock protects against +changes to the table version or size (which are done with the write +lock held). + +The write lock is also required when two active entries must be +acquired. + +The double lock is still required when updating IOMMU page tables. + +With the lock contention being only on the maptrack lock (unless IOMMU +updates are required), performance and scalability is improved. + +Based on a patch originally by Matt Wilson . + +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/docs/misc/grant-tables.txt 2015-07-08 13:49:46.000000000 +0200 ++++ sle12sp1/docs/misc/grant-tables.txt 2015-07-08 13:49:47.000000000 +0200 +@@ -83,7 +83,7 @@ is complete. + ~~~~~~~ + Xen uses several locks to serialize access to the internal grant table state. + +- grant_table->lock : lock used to prevent readers from accessing ++ grant_table->lock : rwlock used to prevent readers from accessing + inconsistent grant table state such as current + version, partially initialized active table pages, + etc. +@@ -91,34 +91,43 @@ is complete. 
+ active_grant_entry->lock : spinlock used to serialize modifications to + active entries + +- The primary lock for the grant table is a spinlock. All functions +- that access members of struct grant_table must acquire the lock +- around critical sections. ++ The primary lock for the grant table is a read/write spinlock. All ++ functions that access members of struct grant_table must acquire a ++ read lock around critical sections. Any modification to the members ++ of struct grant_table (e.g., nr_status_frames, nr_grant_frames, ++ active frames, etc.) must only be made if the write lock is ++ held. These elements are read-mostly, and read critical sections can ++ be large, which makes a rwlock a good choice. + + The maptrack free list is protected by its own spinlock. The maptrack + lock may be locked while holding the grant table lock. + + Active entries are obtained by calling active_entry_acquire(gt, ref). + This function returns a pointer to the active entry after locking its +- spinlock. The caller must hold the grant table lock for the gt in +- question before calling active_entry_acquire(). This is because the +- grant table can be dynamically extended via gnttab_grow_table() while +- a domain is running and must be fully initialized. Once all access to +- the active entry is complete, release the lock by calling +- active_entry_release(act). ++ spinlock. The caller must hold the grant table read lock before ++ calling active_entry_acquire(). This is because the grant table can ++ be dynamically extended via gnttab_grow_table() while a domain is ++ running and must be fully initialized. Once all access to the active ++ entry is complete, release the lock by calling active_entry_release(act). + + Summary of rules for locking: + active_entry_acquire() and active_entry_release() can only be +- called when holding the relevant grant table's lock. I.e.: +- spin_lock(>->lock); ++ called when holding the relevant grant table's read lock. I.e.: ++ read_lock(>->lock); + act = active_entry_acquire(gt, ref); + ... + active_entry_release(act); +- spin_unlock(>->lock); ++ read_unlock(>->lock); + + Active entries cannot be acquired while holding the maptrack lock. + Multiple active entries can be acquired while holding the grant table +- lock. ++ _write_ lock. ++ ++ Maptrack entries are protected by the corresponding active entry ++ lock. As an exception, new maptrack entries may be populated without ++ holding the lock, provided the flags field is written last. This ++ requires any maptrack entry user validates the flags field as ++ non-zero first. 
+ + ******************************************************************************** + +--- sle12sp1.orig/xen/arch/arm/mm.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/arch/arm/mm.c 2015-07-08 13:49:47.000000000 +0200 +@@ -1037,7 +1037,7 @@ int xenmem_add_to_physmap_one( + switch ( space ) + { + case XENMAPSPACE_grant_table: +- spin_lock(&d->grant_table->lock); ++ write_lock(&d->grant_table->lock); + + if ( d->grant_table->gt_version == 0 ) + d->grant_table->gt_version = 1; +@@ -1067,7 +1067,7 @@ int xenmem_add_to_physmap_one( + + t = p2m_ram_rw; + +- spin_unlock(&d->grant_table->lock); ++ write_unlock(&d->grant_table->lock); + break; + case XENMAPSPACE_shared_info: + if ( idx != 0 ) +--- sle12sp1.orig/xen/arch/x86/mm.c 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/arch/x86/mm.c 2015-07-08 13:49:47.000000000 +0200 +@@ -4594,7 +4594,7 @@ int xenmem_add_to_physmap_one( + mfn = virt_to_mfn(d->shared_info); + break; + case XENMAPSPACE_grant_table: +- spin_lock(&d->grant_table->lock); ++ write_lock(&d->grant_table->lock); + + if ( d->grant_table->gt_version == 0 ) + d->grant_table->gt_version = 1; +@@ -4616,7 +4616,7 @@ int xenmem_add_to_physmap_one( + mfn = virt_to_mfn(d->grant_table->shared_raw[idx]); + } + +- spin_unlock(&d->grant_table->lock); ++ write_unlock(&d->grant_table->lock); + break; + case XENMAPSPACE_gmfn_range: + case XENMAPSPACE_gmfn: +--- sle12sp1.orig/xen/common/grant_table.c 2015-07-08 13:49:46.000000000 +0200 ++++ sle12sp1/xen/common/grant_table.c 2015-07-08 13:49:47.000000000 +0200 +@@ -196,7 +196,7 @@ active_entry_acquire(struct grant_table + { + struct active_grant_entry *act; + +- ASSERT(spin_is_locked(&t->lock)); ++ ASSERT(rw_is_locked(&t->lock)); + + act = &_active_entry(t, e); + spin_lock(&act->lock); +@@ -252,25 +252,29 @@ static int __get_paged_frame(unsigned lo + static inline void + double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) + { ++ /* ++ * See mapcount() for why the write lock is also required for the ++ * remote domain. ++ */ + if ( lgt < rgt ) + { +- spin_lock(&lgt->lock); +- spin_lock(&rgt->lock); ++ write_lock(&lgt->lock); ++ write_lock(&rgt->lock); + } + else + { + if ( lgt != rgt ) +- spin_lock(&rgt->lock); +- spin_lock(&lgt->lock); ++ write_lock(&rgt->lock); ++ write_lock(&lgt->lock); + } + } + + static inline void + double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) + { +- spin_unlock(&lgt->lock); ++ write_unlock(&lgt->lock); + if ( lgt != rgt ) +- spin_unlock(&rgt->lock); ++ write_unlock(&rgt->lock); + } + + static inline int +@@ -528,7 +532,7 @@ static int grant_map_exists(const struct + { + unsigned int ref, max_iter; + +- ASSERT(spin_is_locked(&rgt->lock)); ++ ASSERT(rw_is_locked(&rgt->lock)); + + max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT), + nr_grant_entries(rgt)); +@@ -568,15 +572,15 @@ static void mapcount( + *wrc = *rdc = 0; + + /* +- * Must have the local domain's grant table lock when iterating +- * over its maptrack entries. ++ * Must have the local domain's grant table write lock when ++ * iterating over its maptrack entries. + */ +- ASSERT(spin_is_locked(&lgt->lock)); ++ ASSERT(rw_is_write_locked(&lgt->lock)); + /* +- * Must have the remote domain's grant table lock while counting +- * its active entries. ++ * Must have the remote domain's grant table write lock while ++ * counting its active entries. 
+ */ +- ASSERT(spin_is_locked(&rd->grant_table->lock)); ++ ASSERT(rw_is_write_locked(&rd->grant_table->lock)); + + for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) + { +@@ -616,6 +620,7 @@ __gnttab_map_grant_ref( + grant_entry_v2_t *sha2; + grant_entry_header_t *shah; + uint16_t *status; ++ bool_t need_iommu; + + led = current; + ld = led->domain; +@@ -661,7 +666,7 @@ __gnttab_map_grant_ref( + } + + rgt = rd->grant_table; +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + if ( rgt->gt_version == 0 ) + PIN_FAIL(unlock_out, GNTST_general_error, +@@ -735,7 +740,7 @@ __gnttab_map_grant_ref( + cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) ); + + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + /* pg may be set, with a refcount included, from __get_paged_frame */ + if ( !pg ) +@@ -811,12 +816,14 @@ __gnttab_map_grant_ref( + goto undo_out; + } + +- double_gt_lock(lgt, rgt); +- +- if ( gnttab_need_iommu_mapping(ld) ) ++ need_iommu = gnttab_need_iommu_mapping(ld); ++ if ( need_iommu ) + { + unsigned int wrc, rdc; + int err = 0; ++ ++ double_gt_lock(lgt, rgt); ++ + /* We're not translated, so we know that gmfns and mfns are + the same things, so the IOMMU entry is always 1-to-1. */ + mapcount(lgt, rd, frame, &wrc, &rdc); +@@ -842,12 +849,22 @@ __gnttab_map_grant_ref( + + TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); + ++ /* ++ * All maptrack entry users check mt->flags first before using the ++ * other fields so just ensure the flags field is stored last. ++ * ++ * However, if gnttab_need_iommu_mapping() then this would race ++ * with a concurrent mapcount() call (on an unmap, for example) ++ * and a lock is required. ++ */ + mt = &maptrack_entry(lgt, handle); + mt->domid = op->dom; + mt->ref = op->ref; +- mt->flags = op->flags; ++ wmb(); ++ write_atomic(&mt->flags, op->flags); + +- double_gt_unlock(lgt, rgt); ++ if ( need_iommu ) ++ double_gt_unlock(lgt, rgt); + + op->dev_bus_addr = (u64)frame << PAGE_SHIFT; + op->handle = handle; +@@ -870,7 +887,7 @@ __gnttab_map_grant_ref( + put_page(pg); + } + +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + act = active_entry_acquire(rgt, op->ref); + +@@ -893,7 +910,7 @@ __gnttab_map_grant_ref( + active_entry_release(act); + + unlock_out: +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + op->status = rc; + put_maptrack_handle(lgt, handle); + rcu_unlock_domain(rd); +@@ -943,18 +960,19 @@ __gnttab_unmap_common( + } + + op->map = &maptrack_entry(lgt, op->handle); +- spin_lock(&lgt->lock); + +- if ( unlikely(!op->map->flags) ) ++ read_lock(&lgt->lock); ++ ++ if ( unlikely(!read_atomic(&op->map->flags)) ) + { +- spin_unlock(&lgt->lock); ++ read_unlock(&lgt->lock); + gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle); + op->status = GNTST_bad_handle; + return; + } + + dom = op->map->domid; +- spin_unlock(&lgt->lock); ++ read_unlock(&lgt->lock); + + if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) ) + { +@@ -975,9 +993,10 @@ __gnttab_unmap_common( + TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom); + + rgt = rd->grant_table; +- double_gt_lock(lgt, rgt); + +- op->flags = op->map->flags; ++ read_lock(&rgt->lock); ++ ++ op->flags = read_atomic(&op->map->flags); + if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) ) + { + gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle); +@@ -1024,31 +1043,34 @@ __gnttab_unmap_common( + act->pin -= GNTPIN_hstw_inc; + } + +- if ( gnttab_need_iommu_mapping(ld) ) ++ act_release_out: ++ active_entry_release(act); ++ unmap_out: ++ 
read_unlock(&rgt->lock); ++ ++ if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) ) + { + unsigned int wrc, rdc; + int err = 0; ++ ++ double_gt_lock(lgt, rgt); ++ + mapcount(lgt, rd, op->frame, &wrc, &rdc); + if ( (wrc + rdc) == 0 ) + err = iommu_unmap_page(ld, op->frame); + else if ( wrc == 0 ) + err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable); ++ ++ double_gt_unlock(lgt, rgt); ++ + if ( err ) +- { + rc = GNTST_general_error; +- goto act_release_out; +- } + } + + /* If just unmapped a writable mapping, mark as dirtied */ +- if ( !(op->flags & GNTMAP_readonly) ) ++ if ( rc == GNTST_okay && !(op->flags & GNTMAP_readonly) ) + gnttab_mark_dirty(rd, op->frame); + +- act_release_out: +- active_entry_release(act); +- unmap_out: +- double_gt_unlock(lgt, rgt); +- + op->status = rc; + rcu_unlock_domain(rd); + } +@@ -1078,8 +1100,8 @@ __gnttab_unmap_common_complete(struct gn + + rcu_lock_domain(rd); + rgt = rd->grant_table; +- spin_lock(&rgt->lock); + ++ read_lock(&rgt->lock); + if ( rgt->gt_version == 0 ) + goto unlock_out; + +@@ -1145,7 +1167,7 @@ __gnttab_unmap_common_complete(struct gn + act_release_out: + active_entry_release(act); + unlock_out: +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + if ( put_handle ) + { +@@ -1332,11 +1354,13 @@ gnttab_unpopulate_status_frames(struct d + gt->nr_status_frames = 0; + } + ++/* ++ * Grow the grant table. The caller must hold the grant table's ++ * write lock before calling this function. ++ */ + int + gnttab_grow_table(struct domain *d, unsigned int req_nr_frames) + { +- /* d's grant table lock must be held by the caller */ +- + struct grant_table *gt = d->grant_table; + unsigned int i, j; + +@@ -1442,7 +1466,7 @@ gnttab_setup_table( + } + + gt = d->grant_table; +- spin_lock(>->lock); ++ write_lock(>->lock); + + if ( gt->gt_version == 0 ) + gt->gt_version = 1; +@@ -1470,7 +1494,7 @@ gnttab_setup_table( + } + + out3: +- spin_unlock(>->lock); ++ write_unlock(>->lock); + out2: + rcu_unlock_domain(d); + out1: +@@ -1512,13 +1536,13 @@ gnttab_query_size( + goto query_out_unlock; + } + +- spin_lock(&d->grant_table->lock); ++ read_lock(&d->grant_table->lock); + + op.nr_frames = nr_grant_frames(d->grant_table); + op.max_nr_frames = max_grant_frames; + op.status = GNTST_okay; + +- spin_unlock(&d->grant_table->lock); ++ read_unlock(&d->grant_table->lock); + + + query_out_unlock: +@@ -1544,7 +1568,7 @@ gnttab_prepare_for_transfer( + union grant_combo scombo, prev_scombo, new_scombo; + int retries = 0; + +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + if ( rgt->gt_version == 0 ) + { +@@ -1595,11 +1619,11 @@ gnttab_prepare_for_transfer( + scombo = prev_scombo; + } + +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + return 1; + + fail: +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + return 0; + } + +@@ -1614,6 +1638,7 @@ gnttab_transfer( + struct gnttab_transfer gop; + unsigned long mfn; + unsigned int max_bitsize; ++ struct active_grant_entry *act; + + for ( i = 0; i < count; i++ ) + { +@@ -1791,7 +1816,8 @@ gnttab_transfer( + TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id); + + /* Tell the guest about its new page frame. 
*/ +- spin_lock(&e->grant_table->lock); ++ read_lock(&e->grant_table->lock); ++ act = active_entry_acquire(e->grant_table, gop.ref); + + if ( e->grant_table->gt_version == 1 ) + { +@@ -1809,7 +1835,8 @@ gnttab_transfer( + shared_entry_header(e->grant_table, gop.ref)->flags |= + GTF_transfer_completed; + +- spin_unlock(&e->grant_table->lock); ++ active_entry_release(act); ++ read_unlock(&e->grant_table->lock); + + rcu_unlock_domain(e); + +@@ -1847,7 +1874,7 @@ __release_grant_for_copy( + released_read = 0; + released_write = 0; + +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + act = active_entry_acquire(rgt, gref); + sha = shared_entry_header(rgt, gref); +@@ -1889,7 +1916,7 @@ __release_grant_for_copy( + } + + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + if ( td != rd ) + { +@@ -1947,7 +1974,7 @@ __acquire_grant_for_copy( + + *page = NULL; + +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + if ( rgt->gt_version == 0 ) + PIN_FAIL(gt_unlock_out, GNTST_general_error, +@@ -2023,20 +2050,20 @@ __acquire_grant_for_copy( + * here and reacquire + */ + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, + readonly, &grant_frame, page, + &trans_page_off, &trans_length, 0); + +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + act = active_entry_acquire(rgt, gref); + + if ( rc != GNTST_okay ) { + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + return rc; + } + +@@ -2049,7 +2076,7 @@ __acquire_grant_for_copy( + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + put_page(*page); + return __acquire_grant_for_copy(rd, gref, ldom, readonly, + frame, page, page_off, length, +@@ -2118,7 +2145,7 @@ __acquire_grant_for_copy( + *frame = act->frame; + + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + return rc; + + unlock_out_clear: +@@ -2133,7 +2160,7 @@ __acquire_grant_for_copy( + active_entry_release(act); + + gt_unlock_out: +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + return rc; + } +@@ -2307,7 +2334,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA + if ( gt->gt_version == op.version ) + goto out; + +- spin_lock(>->lock); ++ write_lock(>->lock); + /* Make sure that the grant table isn't currently in use when we + change the version number, except for the first 8 entries which + are allowed to be in use (xenstore/xenconsole keeps them mapped). 
+@@ -2392,7 +2419,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA + gt->gt_version = op.version; + + out_unlock: +- spin_unlock(>->lock); ++ write_unlock(>->lock); + + out: + op.version = gt->gt_version; +@@ -2448,7 +2475,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL + + op.status = GNTST_okay; + +- spin_lock(>->lock); ++ read_lock(>->lock); + + for ( i = 0; i < op.nr_frames; i++ ) + { +@@ -2457,7 +2484,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL + op.status = GNTST_bad_virt_addr; + } + +- spin_unlock(>->lock); ++ read_unlock(>->lock); + out2: + rcu_unlock_domain(d); + out1: +@@ -2507,7 +2534,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_ + struct active_grant_entry *act_b = NULL; + s16 rc = GNTST_okay; + +- spin_lock(>->lock); ++ write_lock(>->lock); + + if ( gt->gt_version == 0 ) + PIN_FAIL(out, GNTST_general_error, "grant table not yet set up\n"); +@@ -2558,7 +2585,7 @@ out: + active_entry_release(act_b); + if ( act_a != NULL ) + active_entry_release(act_a); +- spin_unlock(>->lock); ++ write_unlock(>->lock); + + rcu_unlock_domain(d); + +@@ -2629,12 +2656,12 @@ static int __gnttab_cache_flush(gnttab_c + + if ( d != owner ) + { +- spin_lock(&owner->grant_table->lock); ++ read_lock(&owner->grant_table->lock); + + ret = grant_map_exists(d, owner->grant_table, mfn, ref_count); + if ( ret != 0 ) + { +- spin_unlock(&owner->grant_table->lock); ++ read_unlock(&owner->grant_table->lock); + rcu_unlock_domain(d); + put_page(page); + return ret; +@@ -2654,7 +2681,7 @@ static int __gnttab_cache_flush(gnttab_c + ret = 0; + + if ( d != owner ) +- spin_unlock(&owner->grant_table->lock); ++ read_unlock(&owner->grant_table->lock); + unmap_domain_page(v); + put_page(page); + +@@ -2873,7 +2900,7 @@ grant_table_create( + goto no_mem_0; + + /* Simple stuff. */ +- spin_lock_init(&t->lock); ++ rwlock_init(&t->lock); + spin_lock_init(&t->maptrack_lock); + t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES; + +@@ -2983,7 +3010,7 @@ gnttab_release_mappings( + } + + rgt = rd->grant_table; +- spin_lock(&rgt->lock); ++ read_lock(&rgt->lock); + + act = active_entry_acquire(rgt, ref); + sha = shared_entry_header(rgt, ref); +@@ -3044,7 +3071,7 @@ gnttab_release_mappings( + gnttab_clear_flag(_GTF_reading, status); + + active_entry_release(act); +- spin_unlock(&rgt->lock); ++ read_unlock(&rgt->lock); + + rcu_unlock_domain(rd); + +@@ -3092,7 +3119,7 @@ static void gnttab_usage_print(struct do + printk(" -------- active -------- -------- shared --------\n"); + printk("[ref] localdom mfn pin localdom gmfn flags\n"); + +- spin_lock(>->lock); ++ read_lock(>->lock); + + if ( gt->gt_version == 0 ) + goto out; +@@ -3145,7 +3172,7 @@ static void gnttab_usage_print(struct do + } + + out: +- spin_unlock(>->lock); ++ read_unlock(>->lock); + + if ( first ) + printk("grant-table for remote domain:%5d ... " +--- sle12sp1.orig/xen/include/xen/grant_table.h 2015-07-08 13:49:46.000000000 +0200 ++++ sle12sp1/xen/include/xen/grant_table.h 2015-07-08 13:49:47.000000000 +0200 +@@ -64,6 +64,11 @@ struct grant_mapping { + + /* Per-domain grant information. */ + struct grant_table { ++ /* ++ * Lock protecting updates to grant table state (version, active ++ * entry list, etc.) ++ */ ++ rwlock_t lock; + /* Table size. Number of frames shared with guest */ + unsigned int nr_grant_frames; + /* Shared grant table (see include/public/grant_table.h). 
*/ +@@ -84,8 +89,6 @@ struct grant_table { + unsigned int maptrack_limit; + /* Lock protecting the maptrack page list, head, and limit */ + spinlock_t maptrack_lock; +- /* Lock protecting updates to active and shared grant tables. */ +- spinlock_t lock; + /* The defined versions are 1 and 2. Set to 0 if we don't know + what version to use yet. */ + unsigned gt_version; +@@ -103,7 +106,7 @@ gnttab_release_mappings( + struct domain *d); + + /* Increase the size of a domain's grant table. +- * Caller must hold d's grant table lock. ++ * Caller must hold d's grant table write lock. + */ + int + gnttab_grow_table(struct domain *d, unsigned int req_nr_frames); diff --git a/557ffab8-evtchn-factor-out-freeing-an-event-channel.patch b/557ffab8-evtchn-factor-out-freeing-an-event-channel.patch new file mode 100644 index 0000000..74e0094 --- /dev/null +++ b/557ffab8-evtchn-factor-out-freeing-an-event-channel.patch @@ -0,0 +1,47 @@ +# Commit a622b5ade2bdf79ad95e6088a4041e75253c43f3 +# Date 2015-06-16 12:30:16 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: factor out freeing an event channel + +We're going to want to free an event channel from two places. Factor out +the code into a free_evtchn() function. + +Signed-off-by: David Vrabel + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 12:33:47.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-08 13:53:49.000000000 +0200 +@@ -194,6 +194,17 @@ static int get_free_port(struct domain * + return port; + } + ++static void free_evtchn(struct domain *d, struct evtchn *chn) ++{ ++ /* Clear pending event to avoid unexpected behavior on re-bind. */ ++ evtchn_port_clear_pending(d, chn); ++ ++ /* Reset binding to vcpu0 when the channel is freed. */ ++ chn->state = ECS_FREE; ++ chn->notify_vcpu_id = 0; ++ ++ xsm_evtchn_close_post(chn); ++} + + static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc) + { +@@ -571,14 +582,7 @@ static long __evtchn_close(struct domain + BUG(); + } + +- /* Clear pending event to avoid unexpected behavior on re-bind. */ +- evtchn_port_clear_pending(d1, chn1); +- +- /* Reset binding to vcpu0 when the channel is freed. */ +- chn1->state = ECS_FREE; +- chn1->notify_vcpu_id = 0; +- +- xsm_evtchn_close_post(chn1); ++ free_evtchn(d1, chn1); + + out: + if ( d2 != NULL ) diff --git a/5582bf43-evtchn-simplify-port_is_valid.patch b/5582bf43-evtchn-simplify-port_is_valid.patch new file mode 100644 index 0000000..588dc3e --- /dev/null +++ b/5582bf43-evtchn-simplify-port_is_valid.patch @@ -0,0 +1,69 @@ +# Commit 01280dc19cf3da089f98faf4f524b54b5a191df0 +# Date 2015-06-18 14:53:23 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: simplify port_is_valid() + +By keeping a count of the number of currently valid event channels, +port_is_valid() can be simplified. + +d->valid_evtchns is only increased (while holding d->event_lock), so +port_is_valid() may be safely called without taking the lock (this +will be useful later). 
+ +Signed-off-by: David Vrabel + +Index: xen-4.5.1-testing/xen/common/event_channel.c +=================================================================== +--- xen-4.5.1-testing.orig/xen/common/event_channel.c ++++ xen-4.5.1-testing/xen/common/event_channel.c +@@ -191,6 +191,8 @@ static int get_free_port(struct domain * + return -ENOMEM; + bucket_from_port(d, port) = chn; + ++ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); ++ + return port; + } + +@@ -1267,6 +1269,7 @@ int evtchn_init(struct domain *d) + d->evtchn = alloc_evtchn_bucket(d, 0); + if ( !d->evtchn ) + return -ENOMEM; ++ d->valid_evtchns = EVTCHNS_PER_BUCKET; + + spin_lock_init(&d->event_lock); + if ( get_free_port(d) != 0 ) +Index: xen-4.5.1-testing/xen/include/xen/event.h +=================================================================== +--- xen-4.5.1-testing.orig/xen/include/xen/event.h ++++ xen-4.5.1-testing/xen/include/xen/event.h +@@ -90,11 +90,7 @@ static inline bool_t port_is_valid(struc + { + if ( p >= d->max_evtchns ) + return 0; +- if ( !d->evtchn ) +- return 0; +- if ( p < EVTCHNS_PER_BUCKET ) +- return 1; +- return group_from_port(d, p) != NULL && bucket_from_port(d, p) != NULL; ++ return p < read_atomic(&d->valid_evtchns); + } + + static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) +Index: xen-4.5.1-testing/xen/include/xen/sched.h +=================================================================== +--- xen-4.5.1-testing.orig/xen/include/xen/sched.h ++++ xen-4.5.1-testing/xen/include/xen/sched.h +@@ -335,8 +335,9 @@ struct domain + /* Event channel information. */ + struct evtchn *evtchn; /* first bucket only */ + struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ +- unsigned int max_evtchns; +- unsigned int max_evtchn_port; ++ unsigned int max_evtchns; /* number supported by ABI */ ++ unsigned int max_evtchn_port; /* max permitted port number */ ++ unsigned int valid_evtchns; /* number of allocated event channels */ + spinlock_t event_lock; + const struct evtchn_port_ops *evtchn_port_ops; + struct evtchn_fifo_domain *evtchn_fifo; diff --git a/5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch b/5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch new file mode 100644 index 0000000..a6c35e0 --- /dev/null +++ b/5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch @@ -0,0 +1,32 @@ +# Commit e156654d4eb2fdeb524e6b40838767a5dc918966 +# Date 2015-06-18 14:54:25 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: remove the locking when unmasking an event channel + +The event channel lock is no longer required to check if the port is +valid. 
+ +Signed-off-by: David Vrabel + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 13:53:50.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-08 13:54:42.000000000 +0200 +@@ -934,8 +934,6 @@ int evtchn_unmask(unsigned int port) + struct domain *d = current->domain; + struct evtchn *evtchn; + +- ASSERT(spin_is_locked(&d->event_lock)); +- + if ( unlikely(!port_is_valid(d, port)) ) + return -EINVAL; + +@@ -1102,9 +1100,7 @@ long do_event_channel_op(int cmd, XEN_GU + struct evtchn_unmask unmask; + if ( copy_from_guest(&unmask, arg, 1) != 0 ) + return -EFAULT; +- spin_lock(¤t->domain->event_lock); + rc = evtchn_unmask(unmask.port); +- spin_unlock(¤t->domain->event_lock); + break; + } + diff --git a/5583d9c5-x86-MSI-X-cleanup.patch b/5583d9c5-x86-MSI-X-cleanup.patch new file mode 100644 index 0000000..d3a2414 --- /dev/null +++ b/5583d9c5-x86-MSI-X-cleanup.patch @@ -0,0 +1,285 @@ +# Commit 236e13ce60e1c0eb0535ad258e74a3789bc0d074 +# Date 2015-06-19 10:58:45 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86/MSI-X: cleanup + +- __pci_enable_msix() now checks that an MSI-X capability was actually + found +- pass "pos" to msix_capability_init() as both callers already know it + (and hence there's no need to re-obtain it) +- call __pci_disable_msi{,x}() directly instead of via + pci_disable_msi() from __pci_enable_msi{x,}() state validation paths +- use msix_control_reg() instead of open coding it +- log message adjustments +- coding style corrections + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/msi.c ++++ b/xen/arch/x86/msi.c +@@ -35,6 +35,8 @@ + static s8 __read_mostly use_msi = -1; + boolean_param("msi", use_msi); + ++static void __pci_disable_msix(struct msi_desc *); ++ + /* bitmap indicate which fixed map is free */ + static DEFINE_SPINLOCK(msix_fixmap_lock); + static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES); +@@ -129,12 +131,14 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg + unsigned dest; + + memset(msg, 0, sizeof(*msg)); +- if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) { ++ if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) ++ { + dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__); + return; + } + +- if ( vector ) { ++ if ( vector ) ++ { + cpumask_t *mask = this_cpu(scratch_mask); + + cpumask_and(mask, cpu_mask, &cpu_online_map); +@@ -195,8 +199,7 @@ static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) + } + case PCI_CAP_ID_MSIX: + { +- void __iomem *base; +- base = entry->mask_base; ++ void __iomem *base = entry->mask_base; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); +@@ -257,8 +260,7 @@ static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) + } + case PCI_CAP_ID_MSIX: + { +- void __iomem *base; +- base = entry->mask_base; ++ void __iomem *base = entry->mask_base; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); +@@ -281,7 +283,7 @@ void set_msi_affinity(struct irq_desc *desc, const cpumask_t *mask) + struct msi_desc *msi_desc = desc->msi_desc; + + dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID || !msi_desc) ++ if ( dest == BAD_APICID || !msi_desc ) + return; + + ASSERT(spin_is_locked(&desc->lock)); +@@ -332,11 +334,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable) + pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); + if ( pos ) + { +- control = 
pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS); ++ control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); + control &= ~PCI_MSIX_FLAGS_ENABLE; + if ( enable ) + control |= PCI_MSIX_FLAGS_ENABLE; +- pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control); ++ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); + } + } + +@@ -353,9 +355,11 @@ static void msi_set_mask_bit(struct irq_desc *desc, int flag) + + ASSERT(spin_is_locked(&desc->lock)); + BUG_ON(!entry || !entry->dev); +- switch (entry->msi_attrib.type) { ++ switch ( entry->msi_attrib.type ) ++ { + case PCI_CAP_ID_MSI: +- if (entry->msi_attrib.maskbit) { ++ if ( entry->msi_attrib.maskbit ) ++ { + u32 mask_bits; + u16 seg = entry->dev->seg; + u8 bus = entry->dev->bus; +@@ -701,13 +705,14 @@ static u64 read_pci_mem_bar(u16 seg, u8 bus, u8 slot, u8 func, u8 bir, int vf) + * requested MSI-X entries with allocated irqs or non-zero for otherwise. + **/ + static int msix_capability_init(struct pci_dev *dev, ++ unsigned int pos, + struct msi_info *msi, + struct msi_desc **desc, + unsigned int nr_entries) + { + struct arch_msix *msix = dev->msix; + struct msi_desc *entry = NULL; +- int pos, vf; ++ int vf; + u16 control; + u64 table_paddr; + u32 table_offset; +@@ -719,7 +724,6 @@ static int msix_capability_init(struct pci_dev *dev, + + ASSERT(spin_is_locked(&pcidevs_lock)); + +- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); + msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ + +@@ -884,10 +888,9 @@ static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) + old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI); + if ( old_desc ) + { +- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on " +- "device %04x:%02x:%02x.%01x\n", +- msi->irq, msi->seg, msi->bus, +- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); ++ printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n", ++ msi->irq, msi->seg, msi->bus, ++ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); + *desc = old_desc; + return 0; + } +@@ -895,10 +898,10 @@ static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) + old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX); + if ( old_desc ) + { +- dprintk(XENLOG_WARNING, "MSI-X is already in use on " +- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus, +- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); +- pci_disable_msi(old_desc); ++ printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n", ++ msi->seg, msi->bus, ++ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); ++ __pci_disable_msix(old_desc); + } + + return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr); +@@ -912,7 +915,6 @@ static void __pci_disable_msi(struct msi_desc *entry) + msi_set_enable(dev, 0); + + BUG_ON(list_empty(&dev->msi_list)); +- + } + + /** +@@ -932,7 +934,7 @@ static void __pci_disable_msi(struct msi_desc *entry) + **/ + static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc) + { +- int status, pos, nr_entries; ++ int pos, nr_entries; + struct pci_dev *pdev; + u16 control; + u8 slot = PCI_SLOT(msi->devfn); +@@ -941,23 +943,22 @@ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc) + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); +- if ( !pdev ) ++ pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX); ++ if ( !pdev || !pos 
) + return -ENODEV; + +- pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(msi->seg, msi->bus, slot, func, + msix_control_reg(pos)); + nr_entries = multi_msix_capable(control); +- if (msi->entry_nr >= nr_entries) ++ if ( msi->entry_nr >= nr_entries ) + return -EINVAL; + + old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX); + if ( old_desc ) + { +- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on " +- "device %04x:%02x:%02x.%01x\n", +- msi->irq, msi->seg, msi->bus, +- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); ++ printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n", ++ msi->irq, msi->seg, msi->bus, ++ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); + *desc = old_desc; + return 0; + } +@@ -965,15 +966,13 @@ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc) + old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI); + if ( old_desc ) + { +- dprintk(XENLOG_WARNING, "MSI is already in use on " +- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus, +- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); +- pci_disable_msi(old_desc); +- ++ printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n", ++ msi->seg, msi->bus, ++ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); ++ __pci_disable_msi(old_desc); + } + +- status = msix_capability_init(pdev, msi, desc, nr_entries); +- return status; ++ return msix_capability_init(pdev, pos, msi, desc, nr_entries); + } + + static void _pci_cleanup_msix(struct arch_msix *msix) +@@ -991,19 +990,16 @@ static void _pci_cleanup_msix(struct arch_msix *msix) + + static void __pci_disable_msix(struct msi_desc *entry) + { +- struct pci_dev *dev; +- int pos; +- u16 control, seg; +- u8 bus, slot, func; +- +- dev = entry->dev; +- seg = dev->seg; +- bus = dev->bus; +- slot = PCI_SLOT(dev->devfn); +- func = PCI_FUNC(dev->devfn); ++ struct pci_dev *dev = entry->dev; ++ u16 seg = dev->seg; ++ u8 bus = dev->bus; ++ u8 slot = PCI_SLOT(dev->devfn); ++ u8 func = PCI_FUNC(dev->devfn); ++ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, ++ PCI_CAP_ID_MSIX); ++ u16 control = pci_conf_read16(seg, bus, slot, func, ++ msix_control_reg(entry->msi_attrib.pos)); + +- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); +- control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); + msix_set_enable(dev, 0); + + BUG_ON(list_empty(&dev->msi_list)); +@@ -1045,7 +1041,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off) + u16 control = pci_conf_read16(seg, bus, slot, func, + msix_control_reg(pos)); + +- rc = msix_capability_init(pdev, NULL, NULL, ++ rc = msix_capability_init(pdev, pos, NULL, NULL, + multi_msix_capable(control)); + } + spin_unlock(&pcidevs_lock); +@@ -1064,8 +1060,8 @@ int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) + if ( !use_msi ) + return -EPERM; + +- return msi->table_base ? __pci_enable_msix(msi, desc) : +- __pci_enable_msi(msi, desc); ++ return msi->table_base ? 
__pci_enable_msix(msi, desc) : ++ __pci_enable_msi(msi, desc); + } + + /* +@@ -1115,7 +1111,9 @@ int pci_restore_msi_state(struct pci_dev *pdev) + if ( !pdev ) + return -EINVAL; + +- ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn); ++ ret = xsm_resource_setup_pci(XSM_PRIV, ++ (pdev->seg << 16) | (pdev->bus << 8) | ++ pdev->devfn); + if ( ret ) + return ret; + diff --git a/x86-MSI-X-guest-mask.patch b/5583da09-x86-MSI-track-host-and-guest-masking-separately.patch similarity index 73% rename from x86-MSI-X-guest-mask.patch rename to 5583da09-x86-MSI-track-host-and-guest-masking-separately.patch index 2b590ef..e855098 100644 --- a/x86-MSI-X-guest-mask.patch +++ b/5583da09-x86-MSI-track-host-and-guest-masking-separately.patch @@ -1,5 +1,7 @@ -References: bsc#907514 bsc#910258 bsc#918984 bsc#923967 - +# Commit ad28e42bd1d28d746988ed71654e8aa670629753 +# Date 2015-06-19 10:59:53 +0200 +# Author Jan Beulich +# Committer Jan Beulich x86/MSI: track host and guest masking separately In particular we want to avoid losing track of our own intention to @@ -9,8 +11,8 @@ host and guest requested so. Signed-off-by: Jan Beulich Reviewed-by: Andrew Cooper ---- trunk.orig/xen/arch/x86/hpet.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/arch/x86/hpet.c 2015-03-09 09:44:33.000000000 +0100 +--- sle12sp1.orig/xen/arch/x86/hpet.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/arch/x86/hpet.c 2015-07-08 00:00:00.000000000 +0200 @@ -240,7 +240,7 @@ static void hpet_msi_unmask(struct irq_d cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); cfg |= HPET_TN_ENABLE; @@ -29,9 +31,9 @@ Reviewed-by: Andrew Cooper } static int hpet_msi_write(struct hpet_event_channel *ch, struct msi_msg *msg) ---- trunk.orig/xen/arch/x86/hvm/vmsi.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/arch/x86/hvm/vmsi.c 2015-03-09 14:40:46.000000000 +0100 -@@ -216,7 +216,6 @@ static int msixtbl_read( +--- sle12sp1.orig/xen/arch/x86/hvm/vmsi.c 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/vmsi.c 2015-07-08 00:00:00.000000000 +0200 +@@ -219,7 +219,6 @@ static int msixtbl_read( { unsigned long offset; struct msixtbl_entry *entry; @@ -39,9 +41,9 @@ Reviewed-by: Andrew Cooper unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; -@@ -240,10 +239,16 @@ static int msixtbl_read( +@@ -253,13 +252,20 @@ static int msixtbl_read( } - else + if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) { - virt = msixtbl_addr_to_virt(entry, address); + const struct msi_desc *msi_desc; @@ -49,16 +51,21 @@ Reviewed-by: Andrew Cooper + if ( !virt ) goto out; -- *pval = readl(virt); + msi_desc = virt_to_msi_desc(entry->pdev, virt); + if ( !msi_desc ) + goto out; -+ *pval = MASK_INSR(msi_desc->msi_attrib.guest_masked, -+ PCI_MSIX_VECTOR_BITMASK); + if ( len == 4 ) +- *pval = readl(virt); ++ *pval = MASK_INSR(msi_desc->msi_attrib.guest_masked, ++ PCI_MSIX_VECTOR_BITMASK); + else +- *pval |= (u64)readl(virt) << 32; ++ *pval |= (u64)MASK_INSR(msi_desc->msi_attrib.guest_masked, ++ PCI_MSIX_VECTOR_BITMASK) << 32; } r = X86EMUL_OKAY; -@@ -261,7 +266,7 @@ static int msixtbl_write(struct vcpu *v, +@@ -277,7 +283,7 @@ static int msixtbl_write(struct vcpu *v, void *virt; unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; @@ -66,8 +73,8 @@ Reviewed-by: Andrew Cooper + unsigned long flags; struct irq_desc *desc; - if ( len != 4 || (address & 3) ) -@@ -313,37 +318,7 @@ static int msixtbl_write(struct vcpu *v, + if ( (len != 4 && len != 8) || (address & (len - 1)) ) +@@ -337,37 +343,7 @@ static int 
msixtbl_write(struct vcpu *v, ASSERT(msi_desc == desc->msi_desc); @@ -106,77 +113,68 @@ Reviewed-by: Andrew Cooper unlock: spin_unlock_irqrestore(&desc->lock, flags); ---- trunk.orig/xen/arch/x86/msi.c 2015-05-18 11:39:36.000000000 +0200 -+++ trunk/xen/arch/x86/msi.c 2015-05-18 11:44:39.000000000 +0200 -@@ -388,12 +388,13 @@ int msi_maskable_irq(const struct msi_de +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-07-08 00:00:00.000000000 +0200 +@@ -349,9 +349,10 @@ int msi_maskable_irq(const struct msi_de || entry->msi_attrib.maskbit; } --static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag) -+static bool_t msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest) +-static void msi_set_mask_bit(struct irq_desc *desc, int flag) ++static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest) { struct msi_desc *entry = desc->msi_desc; - struct pci_dev *pdev; - u16 seg, control; - u8 bus, slot, func; + bool_t flag = host || guest; ASSERT(spin_is_locked(&desc->lock)); BUG_ON(!entry || !entry->dev); -@@ -449,7 +450,8 @@ static bool_t msi_set_mask_bit(struct ir - default: - return 0; +@@ -383,7 +384,8 @@ static void msi_set_mask_bit(struct irq_ + BUG(); + break; } - entry->msi_attrib.masked = !!flag; + entry->msi_attrib.host_masked = host; + entry->msi_attrib.guest_masked = guest; - - return 1; } -@@ -480,22 +482,39 @@ static int msi_get_mask_bit(const struct + + static int msi_get_mask_bit(const struct msi_desc *entry) +@@ -405,20 +407,33 @@ static int msi_get_mask_bit(const struct void mask_msi_irq(struct irq_desc *desc) { -- if ( unlikely(!msi_set_mask_bit(desc, 1)) ) -+ if ( unlikely(!msi_set_mask_bit(desc, 1, -+ desc->msi_desc->msi_attrib.guest_masked)) ) - BUG_ON(!(desc->status & IRQ_DISABLED)); +- msi_set_mask_bit(desc, 1); ++ msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked); } void unmask_msi_irq(struct irq_desc *desc) { -- if ( unlikely(!msi_set_mask_bit(desc, 0)) ) -+ if ( unlikely(!msi_set_mask_bit(desc, 0, -+ desc->msi_desc->msi_attrib.guest_masked)) ) - WARN(); - } - +- msi_set_mask_bit(desc, 0); ++ msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked); ++} ++ +void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask) +{ + msi_set_mask_bit(desc, desc->msi_desc->msi_attrib.host_masked, mask); -+} -+ + } + static unsigned int startup_msi_irq(struct irq_desc *desc) { - unmask_msi_irq(desc); + bool_t guest_masked = (desc->status & IRQ_GUEST) && + is_hvm_domain(desc->msi_desc->dev->domain); + -+ if ( unlikely(!msi_set_mask_bit(desc, 0, guest_masked)) ) -+ WARN(); ++ msi_set_mask_bit(desc, 0, guest_masked); return 0; } +static void shutdown_msi_irq(struct irq_desc *desc) +{ -+ if ( unlikely(!msi_set_mask_bit(desc, 1, 1)) ) -+ BUG_ON(!(desc->status & IRQ_DISABLED)); ++ msi_set_mask_bit(desc, 1, 1); +} + void ack_nonmaskable_msi_irq(struct irq_desc *desc) { irq_complete_move(desc); -@@ -520,7 +539,7 @@ void end_nonmaskable_msi_irq(struct irq_ +@@ -443,7 +458,7 @@ void end_nonmaskable_msi_irq(struct irq_ static hw_irq_controller pci_msi_maskable = { .typename = "PCI-MSI/-X", .startup = startup_msi_irq, @@ -185,7 +183,7 @@ Reviewed-by: Andrew Cooper .enable = unmask_msi_irq, .disable = mask_msi_irq, .ack = ack_maskable_msi_irq, -@@ -690,7 +709,8 @@ static int msi_capability_init(struct pc +@@ -591,7 +606,8 @@ static int msi_capability_init(struct pc entry[i].msi_attrib.is_64 = is_64bit_address(control); entry[i].msi_attrib.entry_nr = i; entry[i].msi_attrib.maskbit 
= is_mask_bit_support(control); @@ -195,7 +193,7 @@ Reviewed-by: Andrew Cooper entry[i].msi_attrib.pos = pos; if ( entry[i].msi_attrib.maskbit ) entry[i].msi.mpos = mpos; -@@ -939,7 +959,8 @@ static int msix_capability_init(struct p +@@ -817,7 +833,8 @@ static int msix_capability_init(struct p entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = msi->entry_nr; entry->msi_attrib.maskbit = 1; @@ -205,17 +203,17 @@ Reviewed-by: Andrew Cooper entry->msi_attrib.pos = pos; entry->irq = msi->irq; entry->dev = dev; -@@ -1309,7 +1330,8 @@ int pci_restore_msi_state(struct pci_dev +@@ -1152,7 +1169,8 @@ int pci_restore_msi_state(struct pci_dev + for ( i = 0; ; ) { - if ( unlikely(!msi_set_mask_bit(desc, -- entry[i].msi_attrib.masked)) ) -+ entry[i].msi_attrib.host_masked, -+ entry[i].msi_attrib.guest_masked)) ) - BUG(); +- msi_set_mask_bit(desc, entry[i].msi_attrib.masked); ++ msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked, ++ entry[i].msi_attrib.guest_masked); if ( !--nr ) -@@ -1462,7 +1484,7 @@ static void dump_msi(unsigned char key) + break; +@@ -1304,7 +1322,7 @@ static void dump_msi(unsigned char key) else mask = '?'; printk(" %-6s%4u vec=%02x%7s%6s%3sassert%5s%7s" @@ -224,7 +222,7 @@ Reviewed-by: Andrew Cooper type, irq, (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT, data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed", -@@ -1470,7 +1492,10 @@ static void dump_msi(unsigned char key) +@@ -1312,7 +1330,10 @@ static void dump_msi(unsigned char key) data & MSI_DATA_LEVEL_ASSERT ? "" : "de", addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys", addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu", @@ -236,8 +234,8 @@ Reviewed-by: Andrew Cooper } } ---- trunk.orig/xen/drivers/passthrough/amd/iommu_init.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/drivers/passthrough/amd/iommu_init.c 2015-03-09 09:44:48.000000000 +0100 +--- sle12sp1.orig/xen/drivers/passthrough/amd/iommu_init.c 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/drivers/passthrough/amd/iommu_init.c 2015-07-08 00:00:00.000000000 +0200 @@ -451,7 +451,7 @@ static void iommu_msi_unmask(struct irq_ spin_lock_irqsave(&iommu->lock, flags); amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED); @@ -256,8 +254,8 @@ Reviewed-by: Andrew Cooper } static unsigned int iommu_msi_startup(struct irq_desc *desc) ---- trunk.orig/xen/drivers/passthrough/vtd/iommu.c 2015-05-19 23:16:48.000000000 +0200 -+++ trunk/xen/drivers/passthrough/vtd/iommu.c 2015-03-09 09:44:58.000000000 +0100 +--- sle12sp1.orig/xen/drivers/passthrough/vtd/iommu.c 2015-05-19 23:16:48.000000000 +0200 ++++ sle12sp1/xen/drivers/passthrough/vtd/iommu.c 2015-07-08 00:00:00.000000000 +0200 @@ -996,7 +996,7 @@ static void dma_msi_unmask(struct irq_de spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); @@ -276,8 +274,8 @@ Reviewed-by: Andrew Cooper } static unsigned int dma_msi_startup(struct irq_desc *desc) ---- trunk.orig/xen/include/asm-x86/msi.h 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/include/asm-x86/msi.h 2015-03-09 09:42:49.000000000 +0100 +--- sle12sp1.orig/xen/include/asm-x86/msi.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/asm-x86/msi.h 2015-07-08 00:00:00.000000000 +0200 @@ -90,12 +90,13 @@ extern unsigned int pci_msix_get_table_l struct msi_desc { diff --git a/5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch b/5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch new file mode 100644 index 0000000..7a71f42 --- /dev/null +++ 
b/5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch @@ -0,0 +1,284 @@ +# Commit dff515dfeac4c1c13422a128c558ac21ddc6c8db +# Date 2015-06-19 11:01:24 +0200 +# Author Malcolm Crossley +# Committer Jan Beulich +gnttab: use per-VCPU maptrack free lists + +Performance analysis of aggregate network throughput with many VMs +shows that performance is signficantly limited by contention on the +maptrack lock when obtaining/releasing maptrack handles from the free +list. + +Instead of a single free list use a per-VCPU list. This avoids any +contention when obtaining a handle. Handles must be released back to +their original list and since this may occur on a different VCPU there +is some contention on the destination VCPU's free list tail pointer +(but this is much better than a per-domain lock). + +Increase the default maximum number of maptrack frames by 4 times +because: a) struct grant_mapping is now 16 bytes (instead of 8); and +b) a guest may not evenly distribute all the grant map operations +across the VCPUs (meaning some VCPUs need more maptrack entries than +others). + +Signed-off-by: Malcolm Crossley +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/xen/common/domain.c 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/common/domain.c 2015-07-08 13:52:23.000000000 +0200 +@@ -126,6 +126,8 @@ struct vcpu *alloc_vcpu( + + tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); + ++ grant_table_init_vcpu(v); ++ + if ( !zalloc_cpumask_var(&v->cpu_hard_affinity) || + !zalloc_cpumask_var(&v->cpu_hard_affinity_tmp) || + !zalloc_cpumask_var(&v->cpu_hard_affinity_saved) || +--- sle12sp1.orig/xen/common/grant_table.c 2015-07-08 13:49:47.000000000 +0200 ++++ sle12sp1/xen/common/grant_table.c 2015-07-08 13:52:23.000000000 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -57,7 +58,7 @@ integer_param("gnttab_max_frames", max_g + * New options allow to set max_maptrack_frames and + * map_grant_table_frames independently. + */ +-#define DEFAULT_MAX_MAPTRACK_FRAMES 256 ++#define DEFAULT_MAX_MAPTRACK_FRAMES 1024 + + static unsigned int __read_mostly max_maptrack_frames; + integer_param("gnttab_max_maptrack_frames", max_maptrack_frames); +@@ -279,62 +280,103 @@ double_gt_unlock(struct grant_table *lgt + + static inline int + __get_maptrack_handle( +- struct grant_table *t) ++ struct grant_table *t, ++ struct vcpu *v) + { +- unsigned int h; +- if ( unlikely((h = t->maptrack_head) == MAPTRACK_TAIL) ) ++ unsigned int head, next; ++ ++ /* No maptrack pages allocated for this VCPU yet? */ ++ head = v->maptrack_head; ++ if ( unlikely(head == MAPTRACK_TAIL) ) + return -1; +- t->maptrack_head = maptrack_entry(t, h).ref; +- return h; ++ ++ /* ++ * Always keep one entry in the free list to make it easier to add ++ * free entries to the tail. ++ */ ++ next = read_atomic(&maptrack_entry(t, head).ref); ++ if ( unlikely(next == MAPTRACK_TAIL) ) ++ return -1; ++ ++ v->maptrack_head = next; ++ ++ return head; + } + + static inline void + put_maptrack_handle( + struct grant_table *t, int handle) + { +- spin_lock(&t->maptrack_lock); +- maptrack_entry(t, handle).ref = t->maptrack_head; +- t->maptrack_head = handle; +- spin_unlock(&t->maptrack_lock); ++ struct domain *currd = current->domain; ++ struct vcpu *v; ++ unsigned int prev_tail, cur_tail; ++ ++ /* 1. Set entry to be a tail. */ ++ maptrack_entry(t, handle).ref = MAPTRACK_TAIL; ++ ++ /* 2. Add entry to the tail of the list on the original VCPU. 
*/ ++ v = currd->vcpu[maptrack_entry(t, handle).vcpu]; ++ ++ cur_tail = read_atomic(&v->maptrack_tail); ++ do { ++ prev_tail = cur_tail; ++ cur_tail = cmpxchg(&v->maptrack_tail, prev_tail, handle); ++ } while ( cur_tail != prev_tail ); ++ ++ /* 3. Update the old tail entry to point to the new entry. */ ++ write_atomic(&maptrack_entry(t, prev_tail).ref, handle); + } + + static inline int + get_maptrack_handle( + struct grant_table *lgt) + { ++ struct vcpu *curr = current; + int i; + grant_handle_t handle; + struct grant_mapping *new_mt; +- unsigned int new_mt_limit, nr_frames; ++ ++ handle = __get_maptrack_handle(lgt, curr); ++ if ( likely(handle != -1) ) ++ return handle; + + spin_lock(&lgt->maptrack_lock); + +- while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) ) ++ if ( nr_maptrack_frames(lgt) >= max_maptrack_frames ) + { +- nr_frames = nr_maptrack_frames(lgt); +- if ( nr_frames >= max_maptrack_frames ) +- break; ++ spin_unlock(&lgt->maptrack_lock); ++ return -1; ++ } + +- new_mt = alloc_xenheap_page(); +- if ( !new_mt ) +- break; ++ new_mt = alloc_xenheap_page(); ++ if ( !new_mt ) ++ { ++ spin_unlock(&lgt->maptrack_lock); ++ return -1; ++ } ++ clear_page(new_mt); + +- clear_page(new_mt); ++ /* ++ * Use the first new entry and add the remaining entries to the ++ * head of the free list. ++ */ ++ handle = lgt->maptrack_limit; + +- new_mt_limit = lgt->maptrack_limit + MAPTRACK_PER_PAGE; ++ for ( i = 0; i < MAPTRACK_PER_PAGE; i++ ) ++ { ++ new_mt[i].ref = handle + i + 1; ++ new_mt[i].vcpu = curr->vcpu_id; ++ } ++ new_mt[i - 1].ref = curr->maptrack_head; + +- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ ) +- new_mt[i - 1].ref = lgt->maptrack_limit + i; +- new_mt[i - 1].ref = lgt->maptrack_head; +- lgt->maptrack_head = lgt->maptrack_limit; ++ /* Set tail directly if this is the first page for this VCPU. */ ++ if ( curr->maptrack_tail == MAPTRACK_TAIL ) ++ curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1; + +- lgt->maptrack[nr_frames] = new_mt; +- smp_wmb(); +- lgt->maptrack_limit = new_mt_limit; ++ curr->maptrack_head = handle + 1; + +- gdprintk(XENLOG_INFO, "Increased maptrack size to %u frames\n", +- nr_frames + 1); +- } ++ lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt; ++ lgt->maptrack_limit += MAPTRACK_PER_PAGE; + + spin_unlock(&lgt->maptrack_lock); + +@@ -2919,16 +2961,9 @@ grant_table_create( + } + + /* Tracking of mapped foreign frames table */ +- if ( (t->maptrack = xzalloc_array(struct grant_mapping *, +- max_maptrack_frames)) == NULL ) ++ t->maptrack = vzalloc(max_maptrack_frames * sizeof(*t->maptrack)); ++ if ( t->maptrack == NULL ) + goto no_mem_2; +- if ( (t->maptrack[0] = alloc_xenheap_page()) == NULL ) +- goto no_mem_3; +- clear_page(t->maptrack[0]); +- t->maptrack_limit = MAPTRACK_PER_PAGE; +- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ ) +- t->maptrack[0][i - 1].ref = i; +- t->maptrack[0][i - 1].ref = MAPTRACK_TAIL; + + /* Shared grant table. 
*/ + if ( (t->shared_raw = xzalloc_array(void *, max_grant_frames)) == NULL ) +@@ -2960,8 +2995,7 @@ grant_table_create( + free_xenheap_page(t->shared_raw[i]); + xfree(t->shared_raw); + no_mem_3: +- free_xenheap_page(t->maptrack[0]); +- xfree(t->maptrack); ++ vfree(t->maptrack); + no_mem_2: + for ( i = 0; + i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ ) +@@ -3096,7 +3130,7 @@ grant_table_destroy( + + for ( i = 0; i < nr_maptrack_frames(t); i++ ) + free_xenheap_page(t->maptrack[i]); +- xfree(t->maptrack); ++ vfree(t->maptrack); + + for ( i = 0; i < nr_active_grant_frames(t); i++ ) + free_xenheap_page(t->active[i]); +@@ -3110,6 +3144,12 @@ grant_table_destroy( + d->grant_table = NULL; + } + ++void grant_table_init_vcpu(struct vcpu *v) ++{ ++ v->maptrack_head = MAPTRACK_TAIL; ++ v->maptrack_tail = MAPTRACK_TAIL; ++} ++ + static void gnttab_usage_print(struct domain *rd) + { + int first = 1; +--- sle12sp1.orig/xen/include/xen/grant_table.h 2015-07-08 13:49:47.000000000 +0200 ++++ sle12sp1/xen/include/xen/grant_table.h 2015-07-08 13:52:23.000000000 +0200 +@@ -60,6 +60,8 @@ struct grant_mapping { + u32 ref; /* grant ref */ + u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */ + domid_t domid; /* granting domain */ ++ u32 vcpu; /* vcpu which created the grant mapping */ ++ u32 pad; /* round size to a power of 2 */ + }; + + /* Per-domain grant information. */ +@@ -83,9 +85,8 @@ struct grant_table { + grant_status_t **status; + /* Active grant table. */ + struct active_grant_entry **active; +- /* Mapping tracking table. */ ++ /* Mapping tracking table per vcpu. */ + struct grant_mapping **maptrack; +- unsigned int maptrack_head; + unsigned int maptrack_limit; + /* Lock protecting the maptrack page list, head, and limit */ + spinlock_t maptrack_lock; +@@ -99,6 +100,7 @@ int grant_table_create( + struct domain *d); + void grant_table_destroy( + struct domain *d); ++void grant_table_init_vcpu(struct vcpu *v); + + /* Domain death release of granted mappings of other domains' memory. */ + void +--- sle12sp1.orig/xen/include/xen/sched.h 2015-01-14 18:44:18.000000000 +0100 ++++ sle12sp1/xen/include/xen/sched.h 2015-07-08 13:52:23.000000000 +0200 +@@ -219,6 +219,10 @@ struct vcpu + /* VCPU paused by system controller. */ + int controller_pause_count; + ++ /* Maptrack */ ++ unsigned int maptrack_head; ++ unsigned int maptrack_tail; ++ + /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */ + evtchn_port_t virq_to_evtchn[NR_VIRQS]; + spinlock_t virq_lock; diff --git a/5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch b/5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch new file mode 100644 index 0000000..c1cfa2c --- /dev/null +++ b/5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch @@ -0,0 +1,153 @@ +# Commit e76ff6c156906b515c2a4300a81c95886ece5d5f +# Date 2015-06-19 11:02:04 +0200 +# Author David Vrabel +# Committer Jan Beulich +gnttab: steal maptrack entries from other VCPUs + +If a guest is not evenly grant mapping across its VCPUs one of the +VCPUs may run out of free maptrack entries even though other VCPUs +have many free. + +If this happens, "steal" free entries from other VCPUs. We want to +steal entries such that: + +a) We avoid ping-ponging stolen entries between VCPUs. + +b) The number of free entries owned by each VCPUs tends (over time) to + the number it uses. + +So when stealing, we select a VCPU at random (reducing (a)) and we +transfer the stolen entries to the thief VCPU (aiming for (b)). 
+ +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/xen/common/grant_table.c 2015-07-08 13:52:23.000000000 +0200 ++++ sle12sp1/xen/common/grant_table.c 2015-07-08 13:52:31.000000000 +0200 +@@ -283,26 +283,70 @@ __get_maptrack_handle( + struct grant_table *t, + struct vcpu *v) + { +- unsigned int head, next; ++ unsigned int head, next, prev_head; + +- /* No maptrack pages allocated for this VCPU yet? */ +- head = v->maptrack_head; +- if ( unlikely(head == MAPTRACK_TAIL) ) +- return -1; +- +- /* +- * Always keep one entry in the free list to make it easier to add +- * free entries to the tail. +- */ +- next = read_atomic(&maptrack_entry(t, head).ref); +- if ( unlikely(next == MAPTRACK_TAIL) ) +- return -1; ++ do { ++ /* No maptrack pages allocated for this VCPU yet? */ ++ head = read_atomic(&v->maptrack_head); ++ if ( unlikely(head == MAPTRACK_TAIL) ) ++ return -1; + +- v->maptrack_head = next; ++ /* ++ * Always keep one entry in the free list to make it easier to ++ * add free entries to the tail. ++ */ ++ next = read_atomic(&maptrack_entry(t, head).ref); ++ if ( unlikely(next == MAPTRACK_TAIL) ) ++ return -1; ++ ++ prev_head = head; ++ head = cmpxchg(&v->maptrack_head, prev_head, next); ++ } while ( head != prev_head ); + + return head; + } + ++/* ++ * Try to "steal" a free maptrack entry from another VCPU. ++ * ++ * A stolen entry is transferred to the thief, so the number of ++ * entries for each VCPU should tend to the usage pattern. ++ * ++ * To avoid having to atomically count the number of free entries on ++ * each VCPU and to avoid two VCPU repeatedly stealing entries from ++ * each other, the initial victim VCPU is selected randomly. ++ */ ++static int steal_maptrack_handle(struct grant_table *t, ++ const struct vcpu *curr) ++{ ++ const struct domain *currd = curr->domain; ++ unsigned int first, i; ++ ++ /* Find an initial victim. */ ++ first = i = get_random() % currd->max_vcpus; ++ ++ do { ++ if ( currd->vcpu[i] ) ++ { ++ int handle; ++ ++ handle = __get_maptrack_handle(t, currd->vcpu[i]); ++ if ( handle != -1 ) ++ { ++ maptrack_entry(t, handle).vcpu = curr->vcpu_id; ++ return handle; ++ } ++ } ++ ++ i++; ++ if ( i == currd->max_vcpus ) ++ i = 0; ++ } while ( i != first ); ++ ++ /* No free handles on any VCPU. */ ++ return -1; ++} ++ + static inline void + put_maptrack_handle( + struct grant_table *t, int handle) +@@ -342,10 +386,31 @@ get_maptrack_handle( + + spin_lock(&lgt->maptrack_lock); + ++ /* ++ * If we've run out of frames, try stealing an entry from another ++ * VCPU (in case the guest isn't mapping across its VCPUs evenly). ++ */ + if ( nr_maptrack_frames(lgt) >= max_maptrack_frames ) + { ++ /* ++ * Can drop the lock since no other VCPU can be adding a new ++ * frame once they've run out. ++ */ + spin_unlock(&lgt->maptrack_lock); +- return -1; ++ ++ /* ++ * Uninitialized free list? Steal an extra entry for the tail ++ * sentinel. 
++ */ ++ if ( curr->maptrack_tail == MAPTRACK_TAIL ) ++ { ++ handle = steal_maptrack_handle(lgt, curr); ++ if ( handle == -1 ) ++ return -1; ++ curr->maptrack_tail = handle; ++ write_atomic(&curr->maptrack_head, handle); ++ } ++ return steal_maptrack_handle(lgt, curr); + } + + new_mt = alloc_xenheap_page(); +@@ -373,7 +438,7 @@ get_maptrack_handle( + if ( curr->maptrack_tail == MAPTRACK_TAIL ) + curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1; + +- curr->maptrack_head = handle + 1; ++ write_atomic(&curr->maptrack_head, handle + 1); + + lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt; + lgt->maptrack_limit += MAPTRACK_PER_PAGE; diff --git a/5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch b/5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch new file mode 100644 index 0000000..eb64a09 --- /dev/null +++ b/5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch @@ -0,0 +1,105 @@ +# Commit b399386bcdb9d458f5647476a06fe86f5968d87e +# Date 2015-06-22 11:36:17 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: clear xen_consumer when clearing state + +Freeing a xen event channel would clear xen_consumer before clearing +the channel state, leaving a window where the channel is in a funny +state (still bound but no consumer). + +Move the clear of xen_consumer into free_evtchn() where the state is +also cleared. + +Signed-off-by: David Vrabel + +Ditch the pointless evtchn_close() wrapper around __evtchn_close() +(renaming the latter) as well as some bogus casts of function results +to void. + +Signed-off-by: Jan Beulich + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 13:54:42.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-08 13:57:44.000000000 +0200 +@@ -204,6 +204,7 @@ static void free_evtchn(struct domain *d + /* Reset binding to vcpu0 when the channel is freed. */ + chn->state = ECS_FREE; + chn->notify_vcpu_id = 0; ++ chn->xen_consumer = 0; + + xsm_evtchn_close_post(chn); + } +@@ -470,7 +471,7 @@ static long evtchn_bind_pirq(evtchn_bind + } + + +-static long __evtchn_close(struct domain *d1, int port1) ++static long evtchn_close(struct domain *d1, int port1, bool_t guest) + { + struct domain *d2 = NULL; + struct vcpu *v; +@@ -490,7 +491,7 @@ static long __evtchn_close(struct domain + chn1 = evtchn_from_port(d1, port1); + + /* Guest cannot close a Xen-attached event channel. 
*/ +- if ( unlikely(consumer_is_xen(chn1)) ) ++ if ( unlikely(consumer_is_xen(chn1)) && guest ) + { + rc = -EINVAL; + goto out; +@@ -599,12 +600,6 @@ static long __evtchn_close(struct domain + return rc; + } + +- +-static long evtchn_close(evtchn_close_t *close) +-{ +- return __evtchn_close(current->domain, close->port); +-} +- + int evtchn_send(struct domain *d, unsigned int lport) + { + struct evtchn *lchn, *rchn; +@@ -959,7 +954,7 @@ static long evtchn_reset(evtchn_reset_t + goto out; + + for ( i = 0; port_is_valid(d, i); i++ ) +- (void)__evtchn_close(d, i); ++ evtchn_close(d, i, 1); + + spin_lock(&d->event_lock); + +@@ -1066,7 +1061,7 @@ long do_event_channel_op(int cmd, XEN_GU + struct evtchn_close close; + if ( copy_from_guest(&close, arg, 1) != 0 ) + return -EFAULT; +- rc = evtchn_close(&close); ++ rc = evtchn_close(current->domain, close.port, 1); + break; + } + +@@ -1196,11 +1191,10 @@ void free_xen_event_channel( + BUG_ON(!port_is_valid(d, port)); + chn = evtchn_from_port(d, port); + BUG_ON(!consumer_is_xen(chn)); +- chn->xen_consumer = 0; + + spin_unlock(&d->event_lock); + +- (void)__evtchn_close(d, port); ++ evtchn_close(d, port, 0); + } + + +@@ -1299,10 +1293,7 @@ void evtchn_destroy(struct domain *d) + + /* Close all existing event channels. */ + for ( i = 0; port_is_valid(d, i); i++ ) +- { +- evtchn_from_port(d, i)->xen_consumer = 0; +- (void)__evtchn_close(d, i); +- } ++ evtchn_close(d, i, 0); + + /* Free all event-channel buckets. */ + spin_lock(&d->event_lock); diff --git a/5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch b/5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch new file mode 100644 index 0000000..a6e2bdc --- /dev/null +++ b/5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch @@ -0,0 +1,110 @@ +# Commit a753f0e53ff973a8a066e86c1cb3d6dd5c68d59f +# Date 2015-06-22 11:38:01 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: defer freeing struct evtchn's until evtchn_destroy_final() + +notify_via_xen_event_channel() and free_xen_event_channel() had to +check if the domain was dying because they may be called while the +domain is being destroyed and the struct evtchn's are being freed. + +By deferring the freeing of the struct evtchn's until all references +to the domain are dropped, these functions can rely on the channel +state being present and valid. 
+ +Signed-off-by: David Vrabel + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 13:57:44.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-08 14:00:53.000000000 +0200 +@@ -1177,22 +1177,8 @@ int alloc_unbound_xen_event_channel( + void free_xen_event_channel( + struct vcpu *local_vcpu, int port) + { +- struct evtchn *chn; + struct domain *d = local_vcpu->domain; +- +- spin_lock(&d->event_lock); +- +- if ( unlikely(d->is_dying) ) +- { +- spin_unlock(&d->event_lock); +- return; +- } +- + BUG_ON(!port_is_valid(d, port)); +- chn = evtchn_from_port(d, port); +- BUG_ON(!consumer_is_xen(chn)); +- +- spin_unlock(&d->event_lock); + + evtchn_close(d, port, 0); + } +@@ -1206,18 +1192,12 @@ void notify_via_xen_event_channel(struct + + spin_lock(&ld->event_lock); + +- if ( unlikely(ld->is_dying) ) +- { +- spin_unlock(&ld->event_lock); +- return; +- } +- + ASSERT(port_is_valid(ld, lport)); + lchn = evtchn_from_port(ld, lport); +- ASSERT(consumer_is_xen(lchn)); + + if ( likely(lchn->state == ECS_INTERDOMAIN) ) + { ++ ASSERT(consumer_is_xen(lchn)); + rd = lchn->u.interdomain.remote_dom; + rport = lchn->u.interdomain.remote_port; + rchn = evtchn_from_port(rd, rport); +@@ -1285,7 +1265,7 @@ int evtchn_init(struct domain *d) + + void evtchn_destroy(struct domain *d) + { +- unsigned int i, j; ++ unsigned int i; + + /* After this barrier no new event-channel allocations can occur. */ + BUG_ON(!d->is_dying); +@@ -1295,8 +1275,17 @@ void evtchn_destroy(struct domain *d) + for ( i = 0; port_is_valid(d, i); i++ ) + evtchn_close(d, i, 0); + ++ clear_global_virq_handlers(d); ++ ++ evtchn_fifo_destroy(d); ++} ++ ++ ++void evtchn_destroy_final(struct domain *d) ++{ ++ unsigned int i, j; ++ + /* Free all event-channel buckets. */ +- spin_lock(&d->event_lock); + for ( i = 0; i < NR_EVTCHN_GROUPS; i++ ) + { + if ( !d->evtchn_group[i] ) +@@ -1304,20 +1293,9 @@ void evtchn_destroy(struct domain *d) + for ( j = 0; j < BUCKETS_PER_GROUP; j++ ) + free_evtchn_bucket(d, d->evtchn_group[i][j]); + xfree(d->evtchn_group[i]); +- d->evtchn_group[i] = NULL; + } + free_evtchn_bucket(d, d->evtchn); +- d->evtchn = NULL; +- spin_unlock(&d->event_lock); + +- clear_global_virq_handlers(d); +- +- evtchn_fifo_destroy(d); +-} +- +- +-void evtchn_destroy_final(struct domain *d) +-{ + #if MAX_VIRT_CPUS > BITS_PER_LONG + xfree(d->poll_mask); + d->poll_mask = NULL; diff --git a/5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch b/5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch new file mode 100644 index 0000000..752ac90 --- /dev/null +++ b/5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch @@ -0,0 +1,257 @@ +# Commit de6acb78bf0e137cbe5b72cee4a35ca018d759cc +# Date 2015-06-22 11:39:03 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: use a per-event channel lock for sending events + +When sending an event, use a new per-event channel lock to safely +validate the event channel state. + +This new lock must be held when changing event channel state. Note +that the event channel lock must also be held when changing state from +ECS_FREE or it will race with a concurrent get_free_port() call. + +To avoid having to take the remote event channel locks when sending to +an interdomain event channel, the local and remote channel locks are +both held when binding or closing an interdomain event channel. + +This significantly increases the number of events that can be sent +from multiple VCPUs. 
But struct evtchn increases in size, reducing +the number that fit into a single page to 64 (instead of 128). + +Signed-off-by: David Vrabel +Reviewed-by: Jan Beulich + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 14:00:53.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-08 14:04:08.000000000 +0200 +@@ -141,6 +141,7 @@ static struct evtchn *alloc_evtchn_bucke + return NULL; + } + chn[i].port = port + i; ++ spin_lock_init(&chn[i].lock); + } + return chn; + } +@@ -231,11 +232,15 @@ static long evtchn_alloc_unbound(evtchn_ + if ( rc ) + goto out; + ++ spin_lock(&chn->lock); ++ + chn->state = ECS_UNBOUND; + if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) + chn->u.unbound.remote_domid = current->domain->domain_id; + evtchn_port_init(d, chn); + ++ spin_unlock(&chn->lock); ++ + alloc->port = port; + + out: +@@ -246,6 +251,28 @@ static long evtchn_alloc_unbound(evtchn_ + } + + ++static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) ++{ ++ if ( lchn < rchn ) ++ { ++ spin_lock(&lchn->lock); ++ spin_lock(&rchn->lock); ++ } ++ else ++ { ++ if ( lchn != rchn ) ++ spin_lock(&rchn->lock); ++ spin_lock(&lchn->lock); ++ } ++} ++ ++static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) ++{ ++ spin_unlock(&lchn->lock); ++ if ( lchn != rchn ) ++ spin_unlock(&rchn->lock); ++} ++ + static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) + { + struct evtchn *lchn, *rchn; +@@ -288,6 +315,8 @@ static long evtchn_bind_interdomain(evtc + if ( rc ) + goto out; + ++ double_evtchn_lock(lchn, rchn); ++ + lchn->u.interdomain.remote_dom = rd; + lchn->u.interdomain.remote_port = rport; + lchn->state = ECS_INTERDOMAIN; +@@ -303,6 +332,8 @@ static long evtchn_bind_interdomain(evtc + */ + evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport); + ++ double_evtchn_unlock(lchn, rchn); ++ + bind->local_port = lport; + + out: +@@ -343,11 +374,16 @@ static long evtchn_bind_virq(evtchn_bind + ERROR_EXIT(port); + + chn = evtchn_from_port(d, port); ++ ++ spin_lock(&chn->lock); ++ + chn->state = ECS_VIRQ; + chn->notify_vcpu_id = vcpu; + chn->u.virq = virq; + evtchn_port_init(d, chn); + ++ spin_unlock(&chn->lock); ++ + v->virq_to_evtchn[virq] = bind->port = port; + + out: +@@ -374,10 +410,15 @@ static long evtchn_bind_ipi(evtchn_bind_ + ERROR_EXIT(port); + + chn = evtchn_from_port(d, port); ++ ++ spin_lock(&chn->lock); ++ + chn->state = ECS_IPI; + chn->notify_vcpu_id = vcpu; + evtchn_port_init(d, chn); + ++ spin_unlock(&chn->lock); ++ + bind->port = port; + + out: +@@ -452,11 +493,15 @@ static long evtchn_bind_pirq(evtchn_bind + goto out; + } + ++ spin_lock(&chn->lock); ++ + chn->state = ECS_PIRQ; + chn->u.pirq.irq = pirq; + link_pirq_port(port, chn, v); + evtchn_port_init(d, chn); + ++ spin_unlock(&chn->lock); ++ + bind->port = port; + + #ifdef CONFIG_X86 +@@ -577,15 +622,24 @@ static long evtchn_close(struct domain * + BUG_ON(chn2->state != ECS_INTERDOMAIN); + BUG_ON(chn2->u.interdomain.remote_dom != d1); + ++ double_evtchn_lock(chn1, chn2); ++ ++ free_evtchn(d1, chn1); ++ + chn2->state = ECS_UNBOUND; + chn2->u.unbound.remote_domid = d1->domain_id; +- break; ++ ++ double_evtchn_unlock(chn1, chn2); ++ ++ goto out; + + default: + BUG(); + } + ++ spin_lock(&chn1->lock); + free_evtchn(d1, chn1); ++ spin_unlock(&chn1->lock); + + out: + if ( d2 != NULL ) +@@ -607,21 +661,18 @@ int evtchn_send(struct domain *d, unsign + struct vcpu *rvcpu; + int rport, ret = 0; + +- spin_lock(&ld->event_lock); +- +- if ( unlikely(!port_is_valid(ld, 
lport)) ) +- { +- spin_unlock(&ld->event_lock); ++ if ( !port_is_valid(ld, lport) ) + return -EINVAL; +- } + + lchn = evtchn_from_port(ld, lport); + ++ spin_lock(&lchn->lock); ++ + /* Guest cannot send via a Xen-attached event channel. */ + if ( unlikely(consumer_is_xen(lchn)) ) + { +- spin_unlock(&ld->event_lock); +- return -EINVAL; ++ ret = -EINVAL; ++ goto out; + } + + ret = xsm_evtchn_send(XSM_HOOK, ld, lchn); +@@ -651,7 +702,7 @@ int evtchn_send(struct domain *d, unsign + } + + out: +- spin_unlock(&ld->event_lock); ++ spin_unlock(&lchn->lock); + + return ret; + } +@@ -1162,11 +1213,15 @@ int alloc_unbound_xen_event_channel( + if ( rc ) + goto out; + ++ spin_lock(&chn->lock); ++ + chn->state = ECS_UNBOUND; + chn->xen_consumer = get_xen_consumer(notification_fn); + chn->notify_vcpu_id = local_vcpu->vcpu_id; + chn->u.unbound.remote_domid = remote_domid; + ++ spin_unlock(&chn->lock); ++ + out: + spin_unlock(&d->event_lock); + +@@ -1190,11 +1245,11 @@ void notify_via_xen_event_channel(struct + struct domain *rd; + int rport; + +- spin_lock(&ld->event_lock); +- + ASSERT(port_is_valid(ld, lport)); + lchn = evtchn_from_port(ld, lport); + ++ spin_lock(&lchn->lock); ++ + if ( likely(lchn->state == ECS_INTERDOMAIN) ) + { + ASSERT(consumer_is_xen(lchn)); +@@ -1204,7 +1259,7 @@ void notify_via_xen_event_channel(struct + evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport); + } + +- spin_unlock(&ld->event_lock); ++ spin_unlock(&lchn->lock); + } + + void evtchn_check_pollers(struct domain *d, unsigned int port) +--- sle12sp1.orig/xen/include/xen/sched.h 2015-07-08 13:53:50.000000000 +0200 ++++ sle12sp1/xen/include/xen/sched.h 2015-07-08 14:04:08.000000000 +0200 +@@ -79,6 +79,7 @@ extern domid_t hardware_domid; + + struct evtchn + { ++ spinlock_t lock; + #define ECS_FREE 0 /* Channel is available for use. */ + #define ECS_RESERVED 1 /* Channel is reserved. */ + #define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */ diff --git a/5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch b/5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch new file mode 100644 index 0000000..18e6ca8 --- /dev/null +++ b/5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch @@ -0,0 +1,27 @@ +# Commit b58214a24231a1f2a7e09ae9cc3014eff752918b +# Date 2015-06-22 11:39:46 +0200 +# Author David Vrabel +# Committer Jan Beulich +evtchn: pad struct evtchn to 64 bytes + +The number of struct evtchn in a page must be a power of two. Under +some workloads performance is improved slightly by padding struct +evtchn to 64 bytes (a typical cache line size), thus putting the fewer +per-channel locks into each cache line. + +This does not decrease the number of struct evtchn's per-page. 
+ +Signed-off-by: David Vrabel +Acked-by: Jan Beulich + +--- sle12sp1.orig/xen/include/xen/sched.h 2015-07-08 14:04:08.000000000 +0200 ++++ sle12sp1/xen/include/xen/sched.h 2015-07-08 14:04:21.000000000 +0200 +@@ -129,7 +129,7 @@ struct evtchn + #endif + } ssid; + #endif +-}; ++} __attribute__((aligned(64))); + + int evtchn_init(struct domain *d); /* from domain_create */ + void evtchn_destroy(struct domain *d); /* from domain_kill */ diff --git a/558bfaa0-x86-traps-avoid-using-current-too-early.patch b/558bfaa0-x86-traps-avoid-using-current-too-early.patch new file mode 100644 index 0000000..1324723 --- /dev/null +++ b/558bfaa0-x86-traps-avoid-using-current-too-early.patch @@ -0,0 +1,23 @@ +# Commit 142473cfce41a565898e0fa33dc98a1f5e41abe4 +# Date 2015-06-25 14:57:04 +0200 +# Author Andrew Cooper +# Committer Jan Beulich +x86/traps: avoid using current too early on boot + +Early on boot, current has the sentinel value 0xfffff000. Blindly using it in +show_registers() causes a nested failure and no useful information printed +from an early crash. + +Signed-off-by: Andrew Cooper + +--- a/xen/arch/x86/x86_64/traps.c ++++ b/xen/arch/x86/x86_64/traps.c +@@ -84,7 +84,7 @@ void show_registers(const struct cpu_use + struct cpu_user_regs fault_regs = *regs; + unsigned long fault_crs[8]; + enum context context; +- struct vcpu *v = current; ++ struct vcpu *v = system_state >= SYS_STATE_smp_boot ? current : NULL; + + if ( guest_mode(regs) && has_hvm_container_vcpu(v) ) + { diff --git a/5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch b/5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch new file mode 100644 index 0000000..ee57a2d --- /dev/null +++ b/5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch @@ -0,0 +1,50 @@ +# Commit 71bb7304e7a7a35ea6df4b0cedebc35028e4c159 +# Date 2015-06-30 15:00:54 +0100 +# Author Liang Li +# Committer Ian Campbell +nested EPT: fix the handling of nested EPT + +If the host EPT entry is changed, the nested EPT should be updated. +the current code does not do this, and it's wrong. +I have tested this patch, the L2 guest can boot and run as normal. + +Signed-off-by: Liang Li +Signed-off-by: Yang Zhang +Reported-by: Tim Deegan +Reviewed-by: Tim Deegan + +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1040,6 +1041,9 @@ void ept_sync_domain(struct p2m_domain * + + ASSERT(local_irq_is_enabled()); + ++ if ( nestedhvm_enabled(d) && !p2m_is_nestedp2m(p2m) ) ++ p2m_flush_nestedp2m(d); ++ + /* + * Flush active cpus synchronously. Flush others the next time this domain + * is scheduled onto them. 
We accept the race of other CPUs adding to +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -1713,6 +1713,12 @@ p2m_flush_table(struct p2m_domain *p2m) + ASSERT(page_list_empty(&p2m->pod.super)); + ASSERT(page_list_empty(&p2m->pod.single)); + ++ if ( p2m->np2m_base == P2M_BASE_EADDR ) ++ { ++ p2m_unlock(p2m); ++ return; ++ } ++ + /* This is no longer a valid nested p2m for any address space */ + p2m->np2m_base = P2M_BASE_EADDR; + diff --git a/559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch b/559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch new file mode 100644 index 0000000..df473a2 --- /dev/null +++ b/559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch @@ -0,0 +1,64 @@ +# Commit e4e9d2d4e76bd8fe229c124bd57fc6ba824271b3 +# Date 2015-07-07 11:37:26 +0200 +# Author Andrew Cooper +# Committer Jan Beulich +x86/p2m-ept: don't unmap the EPT pagetable while it is still in use + +The call to iommu_pte_flush() between the two hunks uses &ept_entry->epte +which is a pointer into the mapped page. + +It is eventually passed to `clflush` instruction which will suffer a pagefault +if the virtual mapping has fallen out of the TLB. + + (XEN) ----[ Xen-4.5.0-xs102594-d x86_64 debug=y Not tainted ]---- + (XEN) CPU: 7 + (XEN) RIP: e008:[] cacheline_flush+0x4/0x9 + + (XEN) Xen call trace: + (XEN) [] cacheline_flush+0x4/0x9 + (XEN) [] __iommu_flush_cache+0x4a/0x6a + (XEN) [] iommu_pte_flush+0x2b/0xd5 + (XEN) [] ept_set_entry+0x4bc/0x61f + (XEN) [] p2m_set_entry+0xd1/0x112 + (XEN) [] clear_mmio_p2m_entry+0x1a0/0x200 + (XEN) [] unmap_mmio_regions+0x49/0x73 + (XEN) [] do_domctl+0x15bd/0x1edb + (XEN) [] syscall_enter+0xeb/0x145 + (XEN) + (XEN) Pagetable walk from ffff820040004ae0: + (XEN) L4[0x104] = 00000008668a5063 ffffffffffffffff + (XEN) L3[0x001] = 00000008668a3063 ffffffffffffffff + (XEN) L2[0x000] = 000000086689c063 ffffffffffffffff + (XEN) L1[0x004] = 000000056f078063 000000000007f678 + (XEN) + (XEN) **************************************** + (XEN) Panic on CPU 7: + (XEN) FATAL PAGE FAULT + (XEN) [error_code=0000] + (XEN) Faulting linear address: ffff820040004ae0 + (XEN) **************************************** + +Signed-off-by: Andrew Cooper +Reviewed-by: George Dunlap +Reviewed-by: Jan Beulich + +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -764,8 +764,6 @@ ept_set_entry(struct p2m_domain *p2m, un + p2m->max_mapped_pfn = gfn + (1UL << order) - 1; + + out: +- unmap_domain_page(table); +- + if ( needs_sync != sync_off ) + ept_sync_domain(p2m); + +@@ -788,6 +786,8 @@ out: + } + } + ++ unmap_domain_page(table); ++ + /* Release the old intermediate tables, if any. This has to be the + last thing we do, after the ept_sync_domain() and removal + from the iommu tables, so as to avoid a potential diff --git a/559bdde5-pull-in-latest-linux-earlycpio.patch b/559bdde5-pull-in-latest-linux-earlycpio.patch new file mode 100644 index 0000000..8106811 --- /dev/null +++ b/559bdde5-pull-in-latest-linux-earlycpio.patch @@ -0,0 +1,102 @@ +# Commit 39c6664a0e6e1b4ed80660d545dff34ce41bee31 +# Date 2015-07-07 15:10:45 +0100 +# Author Ian Campbell +# Committer Ian Campbell +xen: earlycpio: Pull in latest linux earlycpio.[ch] + +AFAICT our current version does not correspond to any version in the +Linux history. This commit resynchronised to the state in Linux +commit 598bae70c2a8e35c8d39b610cca2b32afcf047af. + +Differences from upstream: find_cpio_data is __init, printk instead of +pr_*. + +This appears to fix Debian bug #785187. 
"Appears" because my test box +happens to be AMD and the issue is that the (valid) cpio generated by +the Intel ucode is not liked by the old Xen code. I've tested by +hacking the hypervisor to look for the Intel path. + +Reported-by: Stephan Seitz +Signed-off-by: Ian Campbell +Cc: Konrad Rzeszutek Wilk +Cc: Jan Beulich +Cc: Stephan Seitz +Cc: 785187@bugs.debian.org +Acked-by: Jan Beulich + +--- a/xen/common/earlycpio.c ++++ b/xen/common/earlycpio.c +@@ -54,25 +54,26 @@ enum cpio_fields { + + /** + * cpio_data find_cpio_data - Search for files in an uncompressed cpio +- * @path: The directory to search for, including a slash at the end +- * @data: Pointer to the the cpio archive or a header inside +- * @len: Remaining length of the cpio based on data pointer +- * @offset: When a matching file is found, this is the offset to the +- * beginning of the cpio. It can be used to iterate through +- * the cpio to find all files inside of a directory path ++ * @path: The directory to search for, including a slash at the end ++ * @data: Pointer to the the cpio archive or a header inside ++ * @len: Remaining length of the cpio based on data pointer ++ * @nextoff: When a matching file is found, this is the offset from the ++ * beginning of the cpio to the beginning of the next file, not the ++ * matching file itself. It can be used to iterate through the cpio ++ * to find all files inside of a directory path. + * +- * @return: struct cpio_data containing the address, length and +- * filename (with the directory path cut off) of the found file. +- * If you search for a filename and not for files in a directory, +- * pass the absolute path of the filename in the cpio and make sure +- * the match returned an empty filename string. ++ * @return: struct cpio_data containing the address, length and ++ * filename (with the directory path cut off) of the found file. ++ * If you search for a filename and not for files in a directory, ++ * pass the absolute path of the filename in the cpio and make sure ++ * the match returned an empty filename string. + */ + + struct cpio_data __init find_cpio_data(const char *path, void *data, +- size_t len, long *offset) ++ size_t len, long *nextoff) + { + const size_t cpio_header_len = 8*C_NFIELDS - 2; +- struct cpio_data cd = { NULL, 0 }; ++ struct cpio_data cd = { NULL, 0, "" }; + const char *p, *dptr, *nptr; + unsigned int ch[C_NFIELDS], *chp, v; + unsigned char c, x; +@@ -129,17 +130,17 @@ struct cpio_data __init find_cpio_data(c + if ((ch[C_MODE] & 0170000) == 0100000 && + ch[C_NAMESIZE] >= mypathsize && + !memcmp(p, path, mypathsize)) { +- *offset = (long)nptr - (long)data; ++ *nextoff = (long)nptr - (long)data; + if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) { + printk( + "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", + p, MAX_CPIO_FILE_NAME); + } +- if (ch[C_NAMESIZE] - 1 /* includes \0 */ == mypathsize) { +- cd.data = (void *)dptr; +- cd.size = ch[C_FILESIZE]; +- return cd; /* Found it! */ +- } ++ strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); ++ ++ cd.data = (void *)dptr; ++ cd.size = ch[C_FILESIZE]; ++ return cd; /* Found it! 
*/ + } + len -= (nptr - p); + p = nptr; +--- a/xen/include/xen/earlycpio.h ++++ b/xen/include/xen/earlycpio.h +@@ -6,6 +6,7 @@ + struct cpio_data { + void *data; + size_t size; ++ char name[MAX_CPIO_FILE_NAME]; + }; + + struct cpio_data find_cpio_data(const char *path, void *data, size_t len, diff --git a/CVE-2015-3259-xsa137.patch b/CVE-2015-3259-xsa137.patch new file mode 100644 index 0000000..354a972 --- /dev/null +++ b/CVE-2015-3259-xsa137.patch @@ -0,0 +1,216 @@ +xl: Sane handling of extra config file arguments + +Various xl sub-commands take additional parameters containing = as +additional config fragments. + +The handling of these config fragments has a number of bugs: + + 1. Use of a static 1024-byte buffer. (If truncation would occur, + with semi-trusted input, a security risk arises due to quotes + being lost.) + + 2. Mishandling of the return value from snprintf, so that if + truncation occurs, the to-write pointer is updated with the + wanted-to-write length, resulting in stack corruption. (This is + XSA-137.) + + 3. Clone-and-hack of the code for constructing the appended + config file. + +These are fixed here, by introducing a new function +`string_realloc_append' and using it everywhere. The `extra_info' +buffers are replaced by pointers, which start off NULL and are +explicitly freed on all return paths. + +The separate variable which will become dom_info.extra_config is +abolished (which involves moving the clearing of dom_info). + +Additional bugs I observe, not fixed here: + + 4. The functions which now call string_realloc_append use ad-hoc + error returns, with multiple calls to `return'. This currently + necessitates multiple new calls to `free'. + + 5. Many of the paths in xl call exit(-rc) where rc is a libxl status + code. This is a ridiculous exit status `convention'. + + 6. The loops for handling extra config data are clone-and-hacks. + + 7. Once the extra config buffer is accumulated, it must be combined + with the appropriate main config file. The code to do this + combining is clone-and-hacked too. + +Signed-off-by: Ian Jackson +Tested-by: Ian Jackson +Acked-by: Ian Campbell + +--- a/tools/libxl/xl_cmdimpl.c ++++ b/tools/libxl/xl_cmdimpl.c +@@ -151,7 +151,7 @@ struct domain_create { + int console_autoconnect; + int checkpointed_stream; + const char *config_file; +- const char *extra_config; /* extra config string */ ++ char *extra_config; /* extra config string */ + const char *restore_file; + int migrate_fd; /* -1 means none */ + char **migration_domname_r; /* from malloc */ +@@ -4572,11 +4572,25 @@ int main_vm_list(int argc, char **argv) + return 0; + } + ++static void string_realloc_append(char **accumulate, const char *more) ++{ ++ /* Appends more to accumulate. Accumulate is either NULL, or ++ * points (always) to a malloc'd nul-terminated string. */ ++ ++ size_t oldlen = *accumulate ? 
strlen(*accumulate) : 0; ++ size_t morelen = strlen(more) + 1/*nul*/; ++ if (oldlen > SSIZE_MAX || morelen > SSIZE_MAX - oldlen) { ++ fprintf(stderr,"Additional config data far too large\n"); ++ exit(-ERROR_FAIL); ++ } ++ ++ *accumulate = xrealloc(*accumulate, oldlen + morelen); ++ memcpy(*accumulate + oldlen, more, morelen); ++} ++ + int main_create(int argc, char **argv) + { + const char *filename = NULL; +- char *p; +- char extra_config[1024]; + struct domain_create dom_info; + int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0, + quiet = 0, monitor = 1, vnc = 0, vncautopass = 0; +@@ -4591,6 +4605,8 @@ int main_create(int argc, char **argv) + {0, 0, 0, 0} + }; + ++ dom_info.extra_config = NULL; ++ + if (argv[1] && argv[1][0] != '-' && !strchr(argv[1], '=')) { + filename = argv[1]; + argc--; argv++; +@@ -4630,20 +4646,21 @@ int main_create(int argc, char **argv) + break; + } + +- extra_config[0] = '\0'; +- for (p = extra_config; optind < argc; optind++) { ++ memset(&dom_info, 0, sizeof(dom_info)); ++ ++ for (; optind < argc; optind++) { + if (strchr(argv[optind], '=') != NULL) { +- p += snprintf(p, sizeof(extra_config) - (p - extra_config), +- "%s\n", argv[optind]); ++ string_realloc_append(&dom_info.extra_config, argv[optind]); ++ string_realloc_append(&dom_info.extra_config, "\n"); + } else if (!filename) { + filename = argv[optind]; + } else { + help("create"); ++ free(dom_info.extra_config); + return 2; + } + } + +- memset(&dom_info, 0, sizeof(dom_info)); + dom_info.debug = debug; + dom_info.daemonize = daemonize; + dom_info.monitor = monitor; +@@ -4651,16 +4668,18 @@ int main_create(int argc, char **argv) + dom_info.dryrun = dryrun_only; + dom_info.quiet = quiet; + dom_info.config_file = filename; +- dom_info.extra_config = extra_config; + dom_info.migrate_fd = -1; + dom_info.vnc = vnc; + dom_info.vncautopass = vncautopass; + dom_info.console_autoconnect = console_autoconnect; + + rc = create_domain(&dom_info); +- if (rc < 0) ++ if (rc < 0) { ++ free(dom_info.extra_config); + return -rc; ++ } + ++ free(dom_info.extra_config); + return 0; + } + +@@ -4668,8 +4687,7 @@ int main_config_update(int argc, char ** + { + uint32_t domid; + const char *filename = NULL; +- char *p; +- char extra_config[1024]; ++ char *extra_config = NULL; + void *config_data = 0; + int config_len = 0; + libxl_domain_config d_config; +@@ -4707,15 +4725,15 @@ int main_config_update(int argc, char ** + break; + } + +- extra_config[0] = '\0'; +- for (p = extra_config; optind < argc; optind++) { ++ for (; optind < argc; optind++) { + if (strchr(argv[optind], '=') != NULL) { +- p += snprintf(p, sizeof(extra_config) - (p - extra_config), +- "%s\n", argv[optind]); ++ string_realloc_append(&extra_config, argv[optind]); ++ string_realloc_append(&extra_config, "\n"); + } else if (!filename) { + filename = argv[optind]; + } else { + help("create"); ++ free(extra_config); + return 2; + } + } +@@ -4724,7 +4742,8 @@ int main_config_update(int argc, char ** + rc = libxl_read_file_contents(ctx, filename, + &config_data, &config_len); + if (rc) { fprintf(stderr, "Failed to read config file: %s: %s\n", +- filename, strerror(errno)); return ERROR_FAIL; } ++ filename, strerror(errno)); ++ free(extra_config); return ERROR_FAIL; } + if (strlen(extra_config)) { + if (config_len > INT_MAX - (strlen(extra_config) + 2 + 1)) { + fprintf(stderr, "Failed to attach extra configration\n"); +@@ -4765,7 +4784,7 @@ int main_config_update(int argc, char ** + libxl_domain_config_dispose(&d_config); + + free(config_data); +- ++ 
free(extra_config); + return 0; + } + +@@ -7022,7 +7041,7 @@ int main_cpupoolcreate(int argc, char ** + { + const char *filename = NULL, *config_src=NULL; + const char *p; +- char extra_config[1024]; ++ char *extra_config = NULL; + int opt; + static struct option opts[] = { + {"defconfig", 1, 0, 'f'}, +@@ -7056,13 +7075,10 @@ int main_cpupoolcreate(int argc, char ** + break; + } + +- memset(extra_config, 0, sizeof(extra_config)); + while (optind < argc) { + if ((p = strchr(argv[optind], '='))) { +- if (strlen(extra_config) + 1 + strlen(argv[optind]) < sizeof(extra_config)) { +- strcat(extra_config, "\n"); +- strcat(extra_config, argv[optind]); +- } ++ string_realloc_append(&extra_config, "\n"); ++ string_realloc_append(&extra_config, argv[optind]); + } else if (!filename) { + filename = argv[optind]; + } else { diff --git a/libxl.pvscsi.patch b/libxl.pvscsi.patch index 250683e..d0cdd51 100644 --- a/libxl.pvscsi.patch +++ b/libxl.pvscsi.patch @@ -31,10 +31,8 @@ ee2e7e5 Merge pull request #1 from aaannz/pvscsi 7de6f49 support character devices too c84381b allow /dev/sda as scsi devspec f11e3a2 pvscsi -Index: xen-4.5.1-testing/docs/man/xl.cfg.pod.5 -=================================================================== ---- xen-4.5.1-testing.orig/docs/man/xl.cfg.pod.5 -+++ xen-4.5.1-testing/docs/man/xl.cfg.pod.5 +--- a/docs/man/xl.cfg.pod.5 ++++ b/docs/man/xl.cfg.pod.5 @@ -448,6 +448,36 @@ value is optional if this is a guest dom =back @@ -72,10 +70,8 @@ Index: xen-4.5.1-testing/docs/man/xl.cfg.pod.5 =item B Specifies the paravirtual framebuffer devices which should be supplied -Index: xen-4.5.1-testing/docs/man/xl.pod.1 -=================================================================== ---- xen-4.5.1-testing.orig/docs/man/xl.pod.1 -+++ xen-4.5.1-testing/docs/man/xl.pod.1 +--- a/docs/man/xl.pod.1 ++++ b/docs/man/xl.pod.1 @@ -1323,6 +1323,26 @@ List virtual trusted platform modules fo =back @@ -103,10 +99,8 @@ Index: xen-4.5.1-testing/docs/man/xl.pod.1 =head1 PCI PASS-THROUGH =over 4 -Index: xen-4.5.1-testing/tools/libxl/libxl.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl.c -+++ xen-4.5.1-testing/tools/libxl/libxl.c +--- a/tools/libxl/libxl.c ++++ b/tools/libxl/libxl.c @@ -2310,6 +2310,273 @@ int libxl_devid_to_device_vtpm(libxl_ctx return rc; } @@ -440,10 +434,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl.c /* * Local variables: * mode: C -Index: xen-4.5.1-testing/tools/libxl/libxl.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl.h -+++ xen-4.5.1-testing/tools/libxl/libxl.h +--- a/tools/libxl/libxl.h ++++ b/tools/libxl/libxl.h @@ -1238,6 +1238,26 @@ libxl_device_vtpm *libxl_device_vtpm_lis int libxl_device_vtpm_getinfo(libxl_ctx *ctx, uint32_t domid, libxl_device_vtpm *vtpm, libxl_vtpminfo *vtpminfo); @@ -499,10 +491,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl.h #endif /* LIBXL_H */ /* -Index: xen-4.5.1-testing/tools/libxl/libxl_create.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl_create.c -+++ xen-4.5.1-testing/tools/libxl/libxl_create.c +--- a/tools/libxl/libxl_create.c ++++ b/tools/libxl/libxl_create.c @@ -1141,6 +1141,7 @@ static void domcreate_rebuild_done(libxl libxl__multidev_begin(ao, &dcs->multidev); dcs->multidev.callback = domcreate_launch_dm; @@ -511,10 +501,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl_create.c libxl__multidev_prepared(egc, &dcs->multidev, 0); 
return; -Index: xen-4.5.1-testing/tools/libxl/libxl_device.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl_device.c -+++ xen-4.5.1-testing/tools/libxl/libxl_device.c +--- a/tools/libxl/libxl_device.c ++++ b/tools/libxl/libxl_device.c @@ -541,6 +541,7 @@ void libxl__multidev_prepared(libxl__egc * The following functions are defined: * libxl__add_disks @@ -556,10 +544,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl_device.c /******************************************************************************/ int libxl__device_destroy(libxl__gc *gc, libxl__device *dev) -Index: xen-4.5.1-testing/tools/libxl/libxl_internal.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl_internal.h -+++ xen-4.5.1-testing/tools/libxl/libxl_internal.h +--- a/tools/libxl/libxl_internal.h ++++ b/tools/libxl/libxl_internal.h @@ -1079,6 +1079,7 @@ _hidden int libxl__device_disk_setdefaul _hidden int libxl__device_nic_setdefault(libxl__gc *gc, libxl_device_nic *nic, uint32_t domid); @@ -590,10 +576,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl_internal.h /*----- device model creation -----*/ /* First layer; wraps libxl__spawn_spawn. */ -Index: xen-4.5.1-testing/tools/libxl/libxl_types.idl -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl_types.idl -+++ xen-4.5.1-testing/tools/libxl/libxl_types.idl +--- a/tools/libxl/libxl_types.idl ++++ b/tools/libxl/libxl_types.idl @@ -540,6 +540,26 @@ libxl_device_channel = Struct("device_ch ])), ]) @@ -659,10 +643,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl_types.idl libxl_vcpuinfo = Struct("vcpuinfo", [ ("vcpuid", uint32), ("cpu", uint32), -Index: xen-4.5.1-testing/tools/libxl/libxl_types_internal.idl -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/libxl_types_internal.idl -+++ xen-4.5.1-testing/tools/libxl/libxl_types_internal.idl +--- a/tools/libxl/libxl_types_internal.idl ++++ b/tools/libxl/libxl_types_internal.idl @@ -22,6 +22,7 @@ libxl__device_kind = Enumeration("device (6, "VKBD"), (7, "CONSOLE"), @@ -671,10 +653,8 @@ Index: xen-4.5.1-testing/tools/libxl/libxl_types_internal.idl ]) libxl__console_backend = Enumeration("console_backend", [ -Index: xen-4.5.1-testing/tools/libxl/xl.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/xl.h -+++ xen-4.5.1-testing/tools/libxl/xl.h +--- a/tools/libxl/xl.h ++++ b/tools/libxl/xl.h @@ -83,6 +83,9 @@ int main_channellist(int argc, char **ar int main_blockattach(int argc, char **argv); int main_blocklist(int argc, char **argv); @@ -685,10 +665,8 @@ Index: xen-4.5.1-testing/tools/libxl/xl.h int main_vtpmattach(int argc, char **argv); int main_vtpmlist(int argc, char **argv); int main_vtpmdetach(int argc, char **argv); -Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/xl_cmdimpl.c -+++ xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c +--- a/tools/libxl/xl_cmdimpl.c ++++ b/tools/libxl/xl_cmdimpl.c @@ -17,6 +17,7 @@ #include "libxl_osdeps.h" @@ -904,7 +882,7 @@ Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c if (!xlu_cfg_get_list(config, "vtpm", &vtpms, 0, 0)) { d_config->num_vtpms = 0; d_config->vtpms = NULL; -@@ -6492,6 +6670,256 @@ int main_blockdetach(int argc, char **ar +@@ -6511,6 +6689,256 @@ int 
main_blockdetach(int argc, char **ar return rc; } @@ -1161,10 +1139,8 @@ Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c int main_vtpmattach(int argc, char **argv) { int opt; -Index: xen-4.5.1-testing/tools/libxl/xl_cmdtable.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/libxl/xl_cmdtable.c -+++ xen-4.5.1-testing/tools/libxl/xl_cmdtable.c +--- a/tools/libxl/xl_cmdtable.c ++++ b/tools/libxl/xl_cmdtable.c @@ -372,6 +372,21 @@ struct cmd_spec cmd_table[] = { "Destroy a domain's virtual block device", " ", diff --git a/qemu-MSI-X-enable-maskall.patch b/qemu-MSI-X-enable-maskall.patch deleted file mode 100644 index 4700fec..0000000 --- a/qemu-MSI-X-enable-maskall.patch +++ /dev/null @@ -1,333 +0,0 @@ -References: bsc#907514 bsc#910258 bsc#918984 bsc#923967 - -xen/MSI-X: drive maskall and enable bits through hypercalls - -Particularly the maskall bit has to be under exclusive hypervisor -control (and since they live in the same config space field, the -enable bit has to follow suit). Use the replacement hypercall -interfaces. - -Signed-off-by: Jan Beulich - -Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pass-through.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/pass-through.c -+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pass-through.c -@@ -814,9 +814,12 @@ static struct pt_reg_info_tbl pt_emu_reg - .offset = PCI_MSI_FLAGS, // 2 - .size = 2, - .init_val = 0x0000, -- .res_mask = 0x3800, -- .ro_mask = 0x07FF, -- .emu_mask = 0x0000, -+ /* This must not be split into res_mask (0x3800) and ro_mask (0x07FF) -+ * because even in permissive mode there must not be any write back -+ * to this register. 
-+ */ -+ .ro_mask = 0x3FFF, -+ .emu_mask = 0xC000, - .init = pt_msixctrl_reg_init, - .u.w.read = pt_word_reg_read, - .u.w.write = pt_msixctrl_reg_write, -@@ -4135,30 +4138,52 @@ static int pt_msixctrl_reg_write(struct - uint16_t *value, uint16_t dev_value, uint16_t valid_mask) - { - struct pt_reg_info_tbl *reg = cfg_entry->reg; -- uint16_t writable_mask = 0; -+ uint16_t writable_mask, val; - uint16_t throughable_mask = get_throughable_mask(ptdev, reg, valid_mask); - uint16_t old_ctrl = cfg_entry->data; - - /* modify emulate register */ - writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask; -- cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask); -+ val = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask); -+ cfg_entry->data = val; - - /* create value for writing to I/O device register */ - *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask); - - /* update MSI-X */ -- if ((*value & PCI_MSIX_ENABLE) && !(*value & PCI_MSIX_MASK)) -+ if ((val & PCI_MSIX_ENABLE) && !(val & PCI_MSIX_MASK)) - { - if (ptdev->msi_trans_en) { - PT_LOG("guest enabling MSI-X, disable MSI-INTx translation\n"); - pt_disable_msi_translate(ptdev); - } -+ if (!ptdev->msix->enabled) { -+ if (!ptdev->msix->maskall) -+ pt_msix_maskall(ptdev, 1); -+ pt_msix_enable(ptdev); -+ } - pt_msix_update(ptdev); -- } else if (!(*value & PCI_MSIX_ENABLE) && ptdev->msix->enabled) { -- pt_msix_disable(ptdev); -+ ptdev->msix->enabled = 1; -+ ptdev->msix->maskall = 0; -+ pt_msix_maskall(ptdev, 0); -+ } else if (ptdev->msix->enabled) { -+ if (!(val & PCI_MSIX_ENABLE)) { -+ pt_msix_disable(ptdev); -+ ptdev->msix->enabled = 0; -+ } else if (!ptdev->msix->maskall) { -+ ptdev->msix->maskall = 1; -+ pt_msix_maskall(ptdev, 1); -+ } - } - -- ptdev->msix->enabled = !!(*value & PCI_MSIX_ENABLE); -+ dev_value = pci_read_word(ptdev->pci_dev, ptdev->msix->ctrl_offset); -+ -+ if (ptdev->msix->enabled && !(dev_value & PCI_MSIX_ENABLE)) -+ PT_ERR("MSI-X unexpectedly disabled\n"); -+ else if ((dev_value & PCI_MSIX_ENABLE) && -+ ptdev->msix->maskall && -+ !(dev_value & PCI_MSIX_MASK)) -+ PT_ERR("MSI-X unexpectedly unmasked\n"); - - return 0; - } -Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pass-through.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/pass-through.h -+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pass-through.h -@@ -206,6 +206,7 @@ struct msix_entry_info { - struct pt_msix_info { - uint32_t ctrl_offset; - int enabled; -+ int maskall; - int total_entries; - int bar_index; - uint64_t table_base; -Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.c -+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.c -@@ -41,20 +41,12 @@ void msi_set_enable(struct pt_dev *dev, - - static void msix_set_enable(struct pt_dev *dev, int en) - { -- uint16_t val = 0; -- uint32_t address = 0; - if (!dev->msix) - return; - -- address = dev->msix->ctrl_offset; -- if (!address) -- return; -- -- val = pci_read_word(dev->pci_dev, address); -- val &= ~PCI_MSIX_ENABLE; -- if (en) -- val |= PCI_MSIX_ENABLE; -- pci_write_word(dev->pci_dev, address, val); -+ xc_physdev_msix_enable(xc_handle, dev->pci_dev->domain, dev->pci_dev->bus, -+ PCI_DEVFN(dev->pci_dev->dev, dev->pci_dev->func), -+ en); - } - - /* MSI virtuailization 
functions */ -@@ -349,6 +341,11 @@ int pt_msix_update(struct pt_dev *dev) - return 0; - } - -+void pt_msix_enable(struct pt_dev *dev) -+{ -+ msix_set_enable(dev, 1); -+} -+ - void pt_msix_disable(struct pt_dev *dev) - { - PCIDevice *d = &dev->dev; -@@ -394,6 +391,15 @@ void pt_msix_disable(struct pt_dev *dev) - } - } - -+int pt_msix_maskall(struct pt_dev *dev, int mask) -+{ -+ return xc_physdev_msix_mask_all(xc_handle, dev->pci_dev->domain, -+ dev->pci_dev->bus, -+ PCI_DEVFN(dev->pci_dev->dev, -+ dev->pci_dev->func), -+ mask); -+} -+ - int pt_msix_update_remap(struct pt_dev *dev, int bar_index) - { - struct msix_entry_info *entry; -Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.h -+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/pt-msi.h -@@ -106,9 +106,15 @@ int - pt_msix_update(struct pt_dev *dev); - - void -+pt_msix_enable(struct pt_dev *dev); -+ -+void - pt_msix_disable(struct pt_dev *dev); - - int -+pt_msix_maskall(struct pt_dev *dev, int mask); -+ -+int - has_msix_mapping(struct pt_dev *dev, int bar_index); - - int -Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt.h -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/xen/xen_pt.h -+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt.h -@@ -181,6 +181,7 @@ typedef struct XenPTMSIXEntry { - typedef struct XenPTMSIX { - uint32_t ctrl_offset; - bool enabled; -+ bool maskall; - int total_entries; - int bar_index; - uint64_t table_base; -@@ -293,7 +294,9 @@ int xen_pt_msix_init(XenPCIPassthroughSt - void xen_pt_msix_delete(XenPCIPassthroughState *s); - int xen_pt_msix_update(XenPCIPassthroughState *s); - int xen_pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index); -+void xen_pt_msix_enable(XenPCIPassthroughState *s); - void xen_pt_msix_disable(XenPCIPassthroughState *s); -+int xen_pt_msix_maskall(XenPCIPassthroughState *s, bool mask); - - static inline bool xen_pt_has_msix_mapping(XenPCIPassthroughState *s, int bar) - { -Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt_config_init.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/xen/xen_pt_config_init.c -+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt_config_init.c -@@ -1436,32 +1436,58 @@ static int xen_pt_msixctrl_reg_write(Xen - uint16_t dev_value, uint16_t valid_mask) - { - XenPTRegInfo *reg = cfg_entry->reg; -- uint16_t writable_mask = 0; -+ uint16_t writable_mask, value; - uint16_t throughable_mask = get_throughable_mask(s, reg, valid_mask); - int debug_msix_enabled_old; - - /* modify emulate register */ - writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask; -- cfg_entry->data = XEN_PT_MERGE_VALUE(*val, cfg_entry->data, writable_mask); -+ value = XEN_PT_MERGE_VALUE(*val, cfg_entry->data, writable_mask); -+ cfg_entry->data = value; - - /* create value for writing to I/O device register */ - *val = XEN_PT_MERGE_VALUE(*val, dev_value, throughable_mask); - -+ debug_msix_enabled_old = s->msix->enabled; -+ - /* update MSI-X */ -- if ((*val & PCI_MSIX_FLAGS_ENABLE) -- && !(*val & PCI_MSIX_FLAGS_MASKALL)) { -+ if ((value & PCI_MSIX_FLAGS_ENABLE) -+ && !(value & PCI_MSIX_FLAGS_MASKALL)) { -+ if (!s->msix->enabled) { -+ if (!s->msix->maskall) { -+ xen_pt_msix_maskall(s, true); -+ } -+ 
xen_pt_msix_enable(s); -+ } - xen_pt_msix_update(s); -- } else if (!(*val & PCI_MSIX_FLAGS_ENABLE) && s->msix->enabled) { -- xen_pt_msix_disable(s); -+ s->msix->enabled = true; -+ s->msix->maskall = false; -+ xen_pt_msix_maskall(s, false); -+ } else if (s->msix->enabled) { -+ if (!(value & PCI_MSIX_FLAGS_ENABLE)) { -+ xen_pt_msix_disable(s); -+ s->msix->enabled = false; -+ } else if (!s->msix->maskall) { -+ s->msix->maskall = true; -+ xen_pt_msix_maskall(s, true); -+ } - } - -- debug_msix_enabled_old = s->msix->enabled; -- s->msix->enabled = !!(*val & PCI_MSIX_FLAGS_ENABLE); - if (s->msix->enabled != debug_msix_enabled_old) { - XEN_PT_LOG(&s->dev, "%s MSI-X\n", - s->msix->enabled ? "enable" : "disable"); - } - -+ xen_host_pci_get_word(&s->real_device, s->msix->ctrl_offset, &dev_value); -+ -+ if (s->msix->enabled && !(dev_value & PCI_MSIX_FLAGS_ENABLE)) { -+ XEN_PT_ERR(&s->dev, "MSI-X unexpectedly disabled\n"); -+ } else if ((dev_value & PCI_MSIX_FLAGS_ENABLE) && -+ s->msix->maskall && -+ !(dev_value & PCI_MSIX_FLAGS_MASKALL)) { -+ XEN_PT_ERR(&s->dev, "MSI-X unexpectedly unmasked\n"); -+ } -+ - return 0; - } - -@@ -1483,9 +1509,12 @@ static XenPTRegInfo xen_pt_emu_reg_msix[ - .offset = PCI_MSI_FLAGS, - .size = 2, - .init_val = 0x0000, -- .res_mask = 0x3800, -- .ro_mask = 0x07FF, -- .emu_mask = 0x0000, -+ /* This must not be split into res_mask (0x3800) and ro_mask (0x07FF) -+ * because even in permissive mode there must not be any write back -+ * to this register. -+ */ -+ .ro_mask = 0x3FFF, -+ .emu_mask = 0xC000, - .init = xen_pt_msixctrl_reg_init, - .u.w.read = xen_pt_word_reg_read, - .u.w.write = xen_pt_msixctrl_reg_write, -Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt_msi.c -=================================================================== ---- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/xen/xen_pt_msi.c -+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/xen/xen_pt_msi.c -@@ -301,8 +301,11 @@ static int msix_set_enable(XenPCIPassthr - return -1; - } - -- return msi_msix_enable(s, s->msix->ctrl_offset, PCI_MSIX_FLAGS_ENABLE, -- enabled); -+ return xc_physdev_msix_enable(xen_xc, s->real_device.domain, -+ s->real_device.bus, -+ PCI_DEVFN(s->real_device.dev, -+ s->real_device.func), -+ enabled); - } - - static int xen_pt_msix_update_one(XenPCIPassthroughState *s, int entry_nr) -@@ -361,6 +364,11 @@ int xen_pt_msix_update(XenPCIPassthrough - return 0; - } - -+void xen_pt_msix_enable(XenPCIPassthroughState *s) -+{ -+ msix_set_enable(s, true); -+} -+ - void xen_pt_msix_disable(XenPCIPassthroughState *s) - { - int i = 0; -@@ -378,6 +386,15 @@ void xen_pt_msix_disable(XenPCIPassthrou - } - } - -+int xen_pt_msix_maskall(XenPCIPassthroughState *s, bool mask) -+{ -+ return xc_physdev_msix_mask_all(xen_xc, s->real_device.domain, -+ s->real_device.bus, -+ PCI_DEVFN(s->real_device.dev, -+ s->real_device.func), -+ mask); -+} -+ - int xen_pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index) - { - XenPTMSIXEntry *entry; diff --git a/qemu-MSI-X-latch-writes.patch b/qemu-MSI-X-latch-writes.patch deleted file mode 100644 index 5c9fd0a..0000000 --- a/qemu-MSI-X-latch-writes.patch +++ /dev/null @@ -1,141 +0,0 @@ -References: bsc#931627 - -xen/MSI-X: latch MSI-X table writes - -The remaining log message in pci_msix_write() is wrong, as there guest -behavior may only appear to be wrong: For one, the old logic didn't -take the mask-all bit into account. And then this shouldn't depend on -host device state (i.e. 
the host may have masked the entry without the -guest having done so). Plus these writes shouldn't be dropped even when -an entry gets unmasked. Instead, if they can't be made take effect -right away, they should take effect on the next unmasking or enabling -operation - the specification explicitly describes such caching -behavior. - -Signed-off-by: Jan Beulich - ---- trunk.orig/tools/qemu-xen-dir-remote/hw/xen/xen_pt.h 2015-06-10 00:00:00.000000000 +0200 -+++ trunk/tools/qemu-xen-dir-remote/hw/xen/xen_pt.h 2015-04-14 08:58:43.000000000 +0200 -@@ -175,9 +175,8 @@ typedef struct XenPTMSIXEntry { - int pirq; - uint64_t addr; - uint32_t data; -- uint32_t vector_ctrl; -+ uint32_t latch[4]; - bool updated; /* indicate whether MSI ADDR or DATA is updated */ -- bool warned; /* avoid issuing (bogus) warning more than once */ - } XenPTMSIXEntry; - typedef struct XenPTMSIX { - uint32_t ctrl_offset; ---- trunk.orig/tools/qemu-xen-dir-remote/hw/xen/xen_pt_msi.c 2015-06-10 00:00:00.000000000 +0200 -+++ trunk/tools/qemu-xen-dir-remote/hw/xen/xen_pt_msi.c 2015-05-07 12:46:09.000000000 +0200 -@@ -25,6 +25,7 @@ - #define XEN_PT_GFLAGSSHIFT_DELIV_MODE 12 - #define XEN_PT_GFLAGSSHIFT_TRG_MODE 15 - -+#define latch(fld) latch[PCI_MSIX_ENTRY_##fld / sizeof(uint32_t)] - - /* - * Helpers -@@ -322,6 +323,13 @@ static int xen_pt_msix_update_one(XenPCI - - pirq = entry->pirq; - -+ if (pirq == XEN_PT_UNASSIGNED_PIRQ || s->msix->maskall || -+ (entry->latch(VECTOR_CTRL) & PCI_MSIX_ENTRY_CTRL_MASKBIT)) { -+ entry->addr = entry->latch(LOWER_ADDR) | -+ ((uint64_t)entry->latch(UPPER_ADDR) << 32); -+ entry->data = entry->latch(DATA); -+ } -+ - rc = msi_msix_setup(s, entry->addr, entry->data, &pirq, true, entry_nr, - entry->pirq == XEN_PT_UNASSIGNED_PIRQ); - if (rc) { -@@ -396,35 +404,15 @@ int xen_pt_msix_update_remap(XenPCIPasst - - static uint32_t get_entry_value(XenPTMSIXEntry *e, int offset) - { -- switch (offset) { -- case PCI_MSIX_ENTRY_LOWER_ADDR: -- return e->addr & UINT32_MAX; -- case PCI_MSIX_ENTRY_UPPER_ADDR: -- return e->addr >> 32; -- case PCI_MSIX_ENTRY_DATA: -- return e->data; -- case PCI_MSIX_ENTRY_VECTOR_CTRL: -- return e->vector_ctrl; -- default: -- return 0; -- } -+ return !(offset % sizeof(*e->latch)) -+ ? e->latch[offset / sizeof(*e->latch)] : 0; - } - - static void set_entry_value(XenPTMSIXEntry *e, int offset, uint32_t val) - { -- switch (offset) { -- case PCI_MSIX_ENTRY_LOWER_ADDR: -- e->addr = (e->addr & ((uint64_t)UINT32_MAX << 32)) | val; -- break; -- case PCI_MSIX_ENTRY_UPPER_ADDR: -- e->addr = (uint64_t)val << 32 | (e->addr & UINT32_MAX); -- break; -- case PCI_MSIX_ENTRY_DATA: -- e->data = val; -- break; -- case PCI_MSIX_ENTRY_VECTOR_CTRL: -- e->vector_ctrl = val; -- break; -+ if (!(offset % sizeof(*e->latch))) -+ { -+ e->latch[offset / sizeof(*e->latch)] = val; - } - } - -@@ -444,39 +432,28 @@ static void pci_msix_write(void *opaque, - offset = addr % PCI_MSIX_ENTRY_SIZE; - - if (offset != PCI_MSIX_ENTRY_VECTOR_CTRL) { -- const volatile uint32_t *vec_ctrl; -- - if (get_entry_value(entry, offset) == val - && entry->pirq != XEN_PT_UNASSIGNED_PIRQ) { - return; - } - -+ entry->updated = true; -+ } else if (msix->enabled && entry->updated && -+ !(val & PCI_MSIX_ENTRY_CTRL_MASKBIT)) { -+ const volatile uint32_t *vec_ctrl; -+ - /* - * If Xen intercepts the mask bit access, entry->vec_ctrl may not be - * up-to-date. Read from hardware directly. 
- */ - vec_ctrl = s->msix->phys_iomem_base + entry_nr * PCI_MSIX_ENTRY_SIZE - + PCI_MSIX_ENTRY_VECTOR_CTRL; -+ set_entry_value(entry, offset, *vec_ctrl); - -- if (msix->enabled && !(*vec_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) { -- if (!entry->warned) { -- entry->warned = true; -- XEN_PT_ERR(&s->dev, "Can't update msix entry %d since MSI-X is" -- " already enabled.\n", entry_nr); -- } -- return; -- } -- -- entry->updated = true; -+ xen_pt_msix_update_one(s, entry_nr); - } - - set_entry_value(entry, offset, val); -- -- if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL) { -- if (msix->enabled && !(val & PCI_MSIX_ENTRY_CTRL_MASKBIT)) { -- xen_pt_msix_update_one(s, entry_nr); -- } -- } - } - - static uint64_t pci_msix_read(void *opaque, hwaddr addr, diff --git a/x86-MSI-X-enable.patch b/x86-MSI-X-enable.patch index eef482a..09c4171 100644 --- a/x86-MSI-X-enable.patch +++ b/x86-MSI-X-enable.patch @@ -1,5 +1,3 @@ -References: bsc#907514 bsc#910258 bsc#918984 bsc#923967 - x86/MSI-X: access MSI-X table only after having enabled MSI-X As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a @@ -9,12 +7,10 @@ instead to prevent interrupts from occurring. Signed-off-by: Jan Beulich Reviewed-by: Andrew Cooper ---- -v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled ---- trunk.orig/xen/arch/x86/msi.c 2015-03-25 09:35:38.000000000 +0100 -+++ trunk/xen/arch/x86/msi.c 2015-05-18 11:39:36.000000000 +0200 -@@ -142,6 +142,17 @@ static bool_t memory_decoded(const struc +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-07-07 18:01:16.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-07-07 18:01:41.000000000 +0200 +@@ -144,6 +144,17 @@ static bool_t memory_decoded(const struc PCI_COMMAND_MEMORY); } @@ -32,9 +28,9 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled /* * MSI message composition */ -@@ -219,7 +230,8 @@ static bool_t read_msi_msg(struct msi_de - void __iomem *base; - base = entry->mask_base; +@@ -222,7 +233,8 @@ static bool_t read_msi_msg(struct msi_de + { + void __iomem *base = entry->mask_base; - if ( unlikely(!memory_decoded(entry->dev)) ) + if ( unlikely(!msix_memory_decoded(entry->dev, @@ -42,9 +38,9 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled return 0; msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); -@@ -285,7 +297,8 @@ static int write_msi_msg(struct msi_desc - void __iomem *base; - base = entry->mask_base; +@@ -287,7 +299,8 @@ static int write_msi_msg(struct msi_desc + { + void __iomem *base = entry->mask_base; - if ( unlikely(!memory_decoded(entry->dev)) ) + if ( unlikely(!msix_memory_decoded(entry->dev, @@ -52,26 +48,33 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled return -ENXIO; writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); -@@ -379,7 +392,7 @@ static bool_t msi_set_mask_bit(struct ir +@@ -381,9 +394,9 @@ static bool_t msi_set_mask_bit(struct ir { struct msi_desc *entry = desc->msi_desc; struct pci_dev *pdev; - u16 seg; + u16 seg, control; u8 bus, slot, func; +- bool_t flag = host || guest; ++ bool_t flag = host || guest, maskall; ASSERT(spin_is_locked(&desc->lock)); -@@ -401,35 +414,38 @@ static bool_t msi_set_mask_bit(struct ir + BUG_ON(!entry || !entry->dev); +@@ -406,36 +419,45 @@ static bool_t msi_set_mask_bit(struct ir } break; case PCI_CAP_ID_MSIX: ++ maskall = pdev->msix->host_maskall; + control = pci_conf_read16(seg, bus, slot, func, + msix_control_reg(entry->msi_attrib.pos)); + if 
( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) ) ++ { ++ pdev->msix->host_maskall = 1; + pci_conf_write16(seg, bus, slot, func, + msix_control_reg(entry->msi_attrib.pos), + control | (PCI_MSIX_FLAGS_ENABLE | + PCI_MSIX_FLAGS_MASKALL)); ++ } if ( likely(memory_decoded(pdev)) ) { writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); @@ -87,6 +90,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled - u16 control; domid_t domid = pdev->domain->domain_id; +- pdev->msix->host_maskall = 1; - control = pci_conf_read16(seg, bus, slot, func, - msix_control_reg(entry->msi_attrib.pos)); - if ( control & PCI_MSIX_FLAGS_MASKALL ) @@ -94,7 +98,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled - pci_conf_write16(seg, bus, slot, func, - msix_control_reg(entry->msi_attrib.pos), - control | PCI_MSIX_FLAGS_MASKALL); -+ control |= PCI_MSIX_FLAGS_MASKALL; ++ maskall = 1; if ( pdev->msix->warned != domid ) { pdev->msix->warned = domid; @@ -107,13 +111,16 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled - break; } - /* fall through */ ++ pdev->msix->host_maskall = maskall; ++ if ( maskall || pdev->msix->guest_maskall ) ++ control |= PCI_MSIX_FLAGS_MASKALL; + pci_conf_write16(seg, bus, slot, func, + msix_control_reg(entry->msi_attrib.pos), control); + return flag; default: return 0; } -@@ -454,7 +470,8 @@ static int msi_get_mask_bit(const struct +@@ -461,7 +483,8 @@ static int msi_get_mask_bit(const struct entry->msi.mpos) >> entry->msi_attrib.entry_nr) & 1; case PCI_CAP_ID_MSIX: @@ -123,7 +130,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled break; return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1; } -@@ -543,9 +560,31 @@ static struct msi_desc *alloc_msi_entry( +@@ -564,9 +587,31 @@ static struct msi_desc *alloc_msi_entry( int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc) { @@ -158,9 +165,14 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled } int __setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc, -@@ -785,16 +824,32 @@ static int msix_capability_init(struct p +@@ -803,20 +848,38 @@ static int msix_capability_init(struct p + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); ++ bool_t maskall = msix->host_maskall; + + ASSERT(spin_is_locked(&pcidevs_lock)); - pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); - msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ + /* @@ -169,6 +181,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled + * to mask all the vectors to prevent interrupts coming in before they're + * fully set up. 
+ */ ++ msix->host_maskall = 1; + pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), + control | (PCI_MSIX_FLAGS_ENABLE | + PCI_MSIX_FLAGS_MASKALL)); @@ -192,7 +205,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled ASSERT(msi); } -@@ -825,6 +880,8 @@ static int msix_capability_init(struct p +@@ -847,6 +910,8 @@ static int msix_capability_init(struct p { if ( !msi || !msi->table_base ) { @@ -201,7 +214,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled xfree(entry); return -ENXIO; } -@@ -867,6 +924,8 @@ static int msix_capability_init(struct p +@@ -889,6 +954,8 @@ static int msix_capability_init(struct p if ( idx < 0 ) { @@ -210,29 +223,57 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled xfree(entry); return idx; } -@@ -922,8 +981,7 @@ static int msix_capability_init(struct p +@@ -915,7 +982,7 @@ static int msix_capability_init(struct p + + if ( !msix->used_entries ) + { +- msix->host_maskall = 0; ++ maskall = 0; + if ( !msix->guest_maskall ) + control &= ~PCI_MSIX_FLAGS_MASKALL; + else +@@ -951,8 +1018,8 @@ static int msix_capability_init(struct p ++msix->used_entries; /* Restore MSI-X enabled bits */ - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), - control & ~PCI_MSIX_FLAGS_MASKALL); ++ msix->host_maskall = maskall; + pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); return 0; } -@@ -1072,7 +1130,10 @@ static void __pci_disable_msix(struct ms +@@ -1092,8 +1159,15 @@ static void __pci_disable_msix(struct ms + PCI_CAP_ID_MSIX); + u16 control = pci_conf_read16(seg, bus, slot, func, + msix_control_reg(entry->msi_attrib.pos)); ++ bool_t maskall = dev->msix->host_maskall; - pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); - control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); - msix_set_enable(dev, 0); + if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) ) ++ { ++ dev->msix->host_maskall = 1; + pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), + control | (PCI_MSIX_FLAGS_ENABLE | + PCI_MSIX_FLAGS_MASKALL)); ++ } BUG_ON(list_empty(&dev->msi_list)); -@@ -1198,6 +1259,8 @@ int pci_restore_msi_state(struct pci_dev +@@ -1105,8 +1179,11 @@ static void __pci_disable_msix(struct ms + "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n", + entry->irq, dev->seg, dev->bus, + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); +- control |= PCI_MSIX_FLAGS_MASKALL; ++ maskall = 1; + } ++ dev->msix->host_maskall = maskall; ++ if ( maskall || dev->msix->guest_maskall ) ++ control |= PCI_MSIX_FLAGS_MASKALL; + pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); + + _pci_cleanup_msix(dev->msix); +@@ -1255,6 +1332,8 @@ int pci_restore_msi_state(struct pci_dev list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list ) { unsigned int i = 0, nr = 1; @@ -241,7 +282,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled irq = entry->irq; desc = &irq_desc[irq]; -@@ -1224,10 +1287,18 @@ int pci_restore_msi_state(struct pci_dev +@@ -1281,10 +1360,18 @@ int pci_restore_msi_state(struct pci_dev } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) { @@ -261,7 +302,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled return -ENXIO; } } -@@ -1256,11 +1327,9 @@ int pci_restore_msi_state(struct pci_dev +@@ -1314,11 +1401,9 @@ int pci_restore_msi_state(struct pci_dev if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) { unsigned int cpos = msi_control_reg(entry->msi_attrib.pos); @@ -275,7 
+316,7 @@ v3: temporarily enable MSI-X in setup_msi_irq() if not already enabled multi_msi_enable(control, entry->msi.nvec); pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), cpos, control); -@@ -1268,7 +1337,9 @@ int pci_restore_msi_state(struct pci_dev +@@ -1326,7 +1411,9 @@ int pci_restore_msi_state(struct pci_dev msi_set_enable(pdev, 1); } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) diff --git a/x86-MSI-X-maskall.patch b/x86-MSI-X-maskall.patch index 2ea7f54..f40cc21 100644 --- a/x86-MSI-X-maskall.patch +++ b/x86-MSI-X-maskall.patch @@ -1,308 +1,65 @@ -References: bsc#907514 bsc#910258 bsc#918984 bsc#923967 +x86/MSI-X: track host and guest mask-all requests separately -x86/MSI-X: provide hypercall interface for mask-all control - -Qemu shouldn't be fiddling with this bit directly, as the hypervisor -may (and now does) use it for its own purposes. Provide it with a -replacement interface, allowing the hypervisor to track host and guest -masking intentions independently (clearing the bit only when both want -it clear). +Host uses of the bits will be added subsequently, and must not be +overridden by guests (including Dom0, namely when acting on behalf of +a guest). Signed-off-by: Jan Beulich ---- -Whether the permission check should really be an XSM_TARGET one needs -to be determined: That allowing the guest to issue the hypercalls on -itself means permitting it to bypass the device model, and thus render -device model state stale. +Reviewed-by: Andrew Cooper ---- trunk.orig/tools/libxc/include/xenctrl.h 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/tools/libxc/include/xenctrl.h 2015-03-25 13:51:05.000000000 +0100 -@@ -1793,6 +1793,17 @@ int xc_physdev_unmap_pirq(xc_interface * - int domid, - int pirq); - -+int xc_physdev_msix_enable(xc_interface *xch, -+ int segment, -+ int bus, -+ int devfn, -+ int on); -+int xc_physdev_msix_mask_all(xc_interface *xch, -+ int segment, -+ int bus, -+ int devfn, -+ int mask); -+ - int xc_hvm_set_pci_intx_level( - xc_interface *xch, domid_t dom, - uint8_t domain, uint8_t bus, uint8_t device, uint8_t intx, ---- trunk.orig/tools/libxc/xc_physdev.c 2013-07-09 20:57:12.000000000 +0200 -+++ trunk/tools/libxc/xc_physdev.c 2015-03-24 15:59:43.000000000 +0100 -@@ -108,3 +108,38 @@ int xc_physdev_unmap_pirq(xc_interface * - return rc; - } - -+int xc_physdev_msix_enable(xc_interface *xch, -+ int segment, -+ int bus, -+ int devfn, -+ int on) -+{ -+ struct physdev_pci_device dev = { -+ .seg = segment, -+ .bus = bus, -+ .devfn = devfn -+ }; -+ -+ return do_physdev_op(xch, -+ on ? PHYSDEVOP_msix_enable -+ : PHYSDEVOP_msix_disable, -+ &dev, sizeof(dev)); -+} -+ -+int xc_physdev_msix_mask_all(xc_interface *xch, -+ int segment, -+ int bus, -+ int devfn, -+ int mask) -+{ -+ struct physdev_pci_device dev = { -+ .seg = segment, -+ .bus = bus, -+ .devfn = devfn -+ }; -+ -+ return do_physdev_op(xch, -+ mask ? 
PHYSDEVOP_msix_mask_all -+ : PHYSDEVOP_msix_unmask_all, -+ &dev, sizeof(dev)); -+} ---- trunk.orig/xen/arch/x86/msi.c 2015-05-18 11:44:39.000000000 +0200 -+++ trunk/xen/arch/x86/msi.c 2015-06-10 12:53:52.000000000 +0200 -@@ -394,7 +394,7 @@ static bool_t msi_set_mask_bit(struct ir - struct pci_dev *pdev; - u16 seg, control; - u8 bus, slot, func; -- bool_t flag = host || guest; -+ bool_t flag = host || guest, maskall; - - ASSERT(spin_is_locked(&desc->lock)); - BUG_ON(!entry || !entry->dev); -@@ -415,13 +415,17 @@ static bool_t msi_set_mask_bit(struct ir - } - break; - case PCI_CAP_ID_MSIX: -+ maskall = pdev->msix->host_maskall; - control = pci_conf_read16(seg, bus, slot, func, - msix_control_reg(entry->msi_attrib.pos)); - if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) ) -+ { -+ pdev->msix->host_maskall = 1; - pci_conf_write16(seg, bus, slot, func, - msix_control_reg(entry->msi_attrib.pos), - control | (PCI_MSIX_FLAGS_ENABLE | - PCI_MSIX_FLAGS_MASKALL)); -+ } - if ( likely(memory_decoded(pdev)) ) - { - writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); -@@ -434,7 +438,7 @@ static bool_t msi_set_mask_bit(struct ir - { - domid_t domid = pdev->domain->domain_id; - -- control |= PCI_MSIX_FLAGS_MASKALL; -+ maskall = 1; - if ( pdev->msix->warned != domid ) - { - pdev->msix->warned = domid; -@@ -444,6 +448,9 @@ static bool_t msi_set_mask_bit(struct ir - PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); - } - } -+ pdev->msix->host_maskall = maskall; -+ if ( maskall || pdev->msix->guest_maskall ) -+ control |= PCI_MSIX_FLAGS_MASKALL; - pci_conf_write16(seg, bus, slot, func, - msix_control_reg(entry->msi_attrib.pos), control); - return flag; -@@ -839,6 +846,7 @@ static int msix_capability_init(struct p - u8 bus = dev->bus; - u8 slot = PCI_SLOT(dev->devfn); - u8 func = PCI_FUNC(dev->devfn); -+ bool_t maskall = msix->host_maskall; - - ASSERT(spin_is_locked(&pcidevs_lock)); - -@@ -850,6 +858,7 @@ static int msix_capability_init(struct p - * to mask all the vectors to prevent interrupts coming in before they're - * fully set up. 
- */ -+ msix->host_maskall = 1; - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), - control | (PCI_MSIX_FLAGS_ENABLE | - PCI_MSIX_FLAGS_MASKALL)); -@@ -972,6 +981,10 @@ static int msix_capability_init(struct p +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-06-22 09:06:30.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-06-22 09:23:08.000000000 +0200 +@@ -843,6 +843,12 @@ static int msix_capability_init(struct p if ( !msix->used_entries ) { -+ maskall = 0; -+ msix->guest_maskall = 0; -+ control &= ~PCI_MSIX_FLAGS_MASKALL; ++ msix->host_maskall = 0; ++ if ( !msix->guest_maskall ) ++ control &= ~PCI_MSIX_FLAGS_MASKALL; ++ else ++ control |= PCI_MSIX_FLAGS_MASKALL; + if ( rangeset_add_range(mmio_ro_ranges, msix->table.first, msix->table.last) ) WARN(); -@@ -1002,6 +1015,7 @@ static int msix_capability_init(struct p - ++msix->used_entries; - - /* Restore MSI-X enabled bits */ -+ msix->host_maskall = maskall; - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); - - return 0; -@@ -1142,6 +1156,7 @@ static void __pci_disable_msix(struct ms - int pos; - u16 control, seg; - u8 bus, slot, func; -+ bool_t maskall; - - dev = entry->dev; - seg = dev->seg; -@@ -1151,10 +1166,14 @@ static void __pci_disable_msix(struct ms - - pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); - control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); -+ maskall = dev->msix->host_maskall; - if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) ) +@@ -1111,6 +1117,34 @@ void pci_cleanup_msi(struct pci_dev *pde + int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg, + unsigned int size, uint32_t *data) + { ++ u16 seg = pdev->seg; ++ u8 bus = pdev->bus; ++ u8 slot = PCI_SLOT(pdev->devfn); ++ u8 func = PCI_FUNC(pdev->devfn); ++ struct msi_desc *entry; ++ unsigned int pos; ++ ++ if ( pdev->msix ) + { -+ dev->msix->host_maskall = 1; - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), - control | (PCI_MSIX_FLAGS_ENABLE | - PCI_MSIX_FLAGS_MASKALL)); ++ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX); ++ pos = entry ? 
entry->msi_attrib.pos ++ : pci_find_cap_offset(seg, bus, slot, func, ++ PCI_CAP_ID_MSIX); ++ ASSERT(pos); ++ ++ if ( reg < pos || reg >= msix_pba_offset_reg(pos) + 4 ) ++ return 0; ++ ++ if ( reg != msix_control_reg(pos) || size != 2 ) ++ return -EACCES; ++ ++ pdev->msix->guest_maskall = !!(*data & PCI_MSIX_FLAGS_MASKALL); ++ if ( pdev->msix->host_maskall ) ++ *data |= PCI_MSIX_FLAGS_MASKALL; ++ ++ return 1; + } - - BUG_ON(list_empty(&dev->msi_list)); - -@@ -1166,8 +1185,11 @@ static void __pci_disable_msix(struct ms - "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n", - entry->irq, dev->seg, dev->bus, - PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); -- control |= PCI_MSIX_FLAGS_MASKALL; -+ maskall = 1; - } -+ dev->msix->host_maskall = maskall; -+ if ( maskall || dev->msix->guest_maskall ) -+ control |= PCI_MSIX_FLAGS_MASKALL; - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); - - _pci_cleanup_msix(dev->msix); -@@ -1211,6 +1233,62 @@ int pci_prepare_msix(u16 seg, u8 bus, u8 - return rc; ++ + return 0; } -+int pci_msix_enable(u16 seg, u8 bus, u8 devfn, bool_t on) -+{ -+ int rc; -+ struct pci_dev *pdev; -+ -+ if ( !use_msi ) -+ return -EOPNOTSUPP; -+ -+ spin_lock(&pcidevs_lock); -+ pdev = pci_get_pdev(seg, bus, devfn); -+ if ( !pdev || !pdev->msix || !pdev->domain ) -+ rc = -ENODEV; -+ else if ( !is_hvm_domain(pdev->domain) ) -+ rc = -ENXIO; -+ else if ( (rc = xsm_manage_domain_pirq(XSM_TARGET, pdev->domain)) == 0 ) -+ msix_set_enable(pdev, on); -+ spin_unlock(&pcidevs_lock); -+ -+ return rc; -+} -+ -+int pci_msix_maskall(u16 seg, u8 bus, u8 devfn, bool_t mask) -+{ -+ int rc; -+ struct pci_dev *pdev; -+ u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn); -+ -+ if ( !use_msi ) -+ return -EOPNOTSUPP; -+ -+ spin_lock(&pcidevs_lock); -+ pdev = pci_get_pdev(seg, bus, devfn); -+ if ( !pdev || !pdev->msix || !pdev->domain ) -+ rc = -ENODEV; -+ else if ( !is_hvm_domain(pdev->domain) ) -+ rc = -ENXIO; -+ else if ( (rc = xsm_manage_domain_pirq(XSM_TARGET, pdev->domain)) == 0 ) -+ { -+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, -+ PCI_CAP_ID_MSIX); -+ u16 control = pci_conf_read16(seg, bus, slot, func, -+ msix_control_reg(pos)); -+ -+ BUG_ON(!pos); -+ pdev->msix->guest_maskall = mask; -+ if ( pdev->msix->host_maskall || mask ) -+ control |= PCI_MSIX_FLAGS_MASKALL; -+ else -+ control &= ~PCI_MSIX_FLAGS_MASKALL; -+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); -+ } -+ spin_unlock(&pcidevs_lock); -+ -+ return rc; -+} -+ - /* - * Notice: only construct the msi_desc - * no change to irq_desc here, and the interrupt is masked ---- trunk.orig/xen/arch/x86/physdev.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/arch/x86/physdev.c 2015-03-25 14:02:24.000000000 +0100 -@@ -648,6 +648,30 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H - break; - } - -+ case PHYSDEVOP_msix_enable: -+ case PHYSDEVOP_msix_disable: { -+ struct physdev_pci_device dev; -+ -+ if ( copy_from_guest(&dev, arg, 1) ) -+ ret = -EFAULT; -+ else -+ ret = pci_msix_enable(dev.seg, dev.bus, dev.devfn, -+ cmd == PHYSDEVOP_msix_enable); -+ break; -+ } -+ -+ case PHYSDEVOP_msix_mask_all: -+ case PHYSDEVOP_msix_unmask_all: { -+ struct physdev_pci_device dev; -+ -+ if ( copy_from_guest(&dev, arg, 1) ) -+ ret = -EFAULT; -+ else -+ ret = pci_msix_maskall(dev.seg, dev.bus, dev.devfn, -+ cmd == PHYSDEVOP_msix_mask_all); -+ break; -+ } -+ - case PHYSDEVOP_pci_mmcfg_reserved: { - struct physdev_pci_mmcfg_reserved info; - ---- trunk.orig/xen/include/asm-x86/msi.h 2015-03-09 
09:42:49.000000000 +0100 -+++ trunk/xen/include/asm-x86/msi.h 2015-03-25 14:01:00.000000000 +0100 -@@ -78,6 +78,8 @@ struct msi_desc; - extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc); - extern void pci_disable_msi(struct msi_desc *desc); - extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off); -+extern int pci_msix_enable(u16 seg, u8 bus, u8 devfn, bool_t on); -+extern int pci_msix_maskall(u16 seg, u8 bus, u8 devfn, bool_t mask); - extern void pci_cleanup_msi(struct pci_dev *pdev); - extern int setup_msi_irq(struct irq_desc *, struct msi_desc *); - extern int __setup_msi_irq(struct irq_desc *, struct msi_desc *, -@@ -228,6 +230,7 @@ struct arch_msix { +--- sle12sp1.orig/xen/include/asm-x86/msi.h 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/include/asm-x86/msi.h 2015-06-19 09:32:02.000000000 +0200 +@@ -228,6 +228,7 @@ struct arch_msix { int table_refcnt[MAX_MSIX_TABLE_PAGES]; int table_idx[MAX_MSIX_TABLE_PAGES]; spinlock_t table_lock; @@ -310,89 +67,3 @@ device model state stale. domid_t warned; }; ---- trunk.orig/xen/include/public/physdev.h 2013-12-24 18:25:25.000000000 +0100 -+++ trunk/xen/include/public/physdev.h 2015-03-24 15:54:54.000000000 +0100 -@@ -310,6 +310,14 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi - */ - #define PHYSDEVOP_prepare_msix 30 - #define PHYSDEVOP_release_msix 31 -+/* -+ * The device model domain for a guest should be using these instead of -+ * fiddling with the respective flags in the MSI-X capability structure. -+ */ -+#define PHYSDEVOP_msix_enable 32 -+#define PHYSDEVOP_msix_disable 33 -+#define PHYSDEVOP_msix_mask_all 34 -+#define PHYSDEVOP_msix_unmask_all 35 - struct physdev_pci_device { - /* IN */ - uint16_t seg; ---- trunk.orig/xen/include/xsm/dummy.h 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/include/xsm/dummy.h 2015-03-23 11:13:16.000000000 +0100 -@@ -439,6 +439,12 @@ static XSM_INLINE int xsm_map_domain_irq - return xsm_default_action(action, current->domain, d); - } - -+static XSM_INLINE int xsm_manage_domain_pirq(XSM_DEFAULT_ARG struct domain *d) -+{ -+ XSM_ASSERT_ACTION(XSM_TARGET); -+ return xsm_default_action(action, current->domain, d); -+} -+ - static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d) - { - XSM_ASSERT_ACTION(XSM_TARGET); ---- trunk.orig/xen/include/xsm/xsm.h 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/include/xsm/xsm.h 2015-05-15 10:28:19.000000000 +0200 -@@ -105,6 +105,7 @@ struct xsm_operations { - char *(*show_irq_sid) (int irq); - int (*map_domain_pirq) (struct domain *d); - int (*map_domain_irq) (struct domain *d, int irq, void *data); -+ int (*manage_domain_pirq) (struct domain *d); - int (*unmap_domain_pirq) (struct domain *d); - int (*unmap_domain_irq) (struct domain *d, int irq, void *data); - int (*irq_permission) (struct domain *d, int pirq, uint8_t allow); -@@ -409,6 +410,11 @@ static inline int xsm_map_domain_irq (xs - return xsm_ops->map_domain_irq(d, irq, data); - } - -+static inline int xsm_manage_domain_pirq(xsm_default_t def, struct domain *d) -+{ -+ return xsm_ops->manage_domain_pirq(d); -+} -+ - static inline int xsm_unmap_domain_pirq (xsm_default_t def, struct domain *d) - { - return xsm_ops->unmap_domain_pirq(d); ---- trunk.orig/xen/xsm/dummy.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/xsm/dummy.c 2015-05-15 10:27:35.000000000 +0200 -@@ -79,6 +79,7 @@ void xsm_fixup_ops (struct xsm_operation - set_to_dummy_if_null(ops, show_irq_sid); - set_to_dummy_if_null(ops, map_domain_pirq); - set_to_dummy_if_null(ops, 
map_domain_irq); -+ set_to_dummy_if_null(ops, manage_domain_pirq); - set_to_dummy_if_null(ops, unmap_domain_pirq); - set_to_dummy_if_null(ops, unmap_domain_irq); - set_to_dummy_if_null(ops, irq_permission); ---- trunk.orig/xen/xsm/flask/hooks.c 2015-01-14 18:44:18.000000000 +0100 -+++ trunk/xen/xsm/flask/hooks.c 2015-05-15 10:27:50.000000000 +0200 -@@ -875,6 +875,11 @@ static int flask_map_domain_irq (struct - return rc; - } - -+static int flask_manage_domain_pirq(struct domain *d) -+{ -+ return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__USE); -+} -+ - static int flask_unmap_domain_pirq (struct domain *d) - { - return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE); -@@ -1556,6 +1561,7 @@ static struct xsm_operations flask_ops = - - .map_domain_pirq = flask_map_domain_pirq, - .map_domain_irq = flask_map_domain_irq, -+ .manage_domain_pirq = flask_manage_domain_pirq, - .unmap_domain_pirq = flask_unmap_domain_pirq, - .unmap_domain_irq = flask_unmap_domain_irq, - .irq_permission = flask_irq_permission, diff --git a/x86-MSI-X-teardown.patch b/x86-MSI-X-teardown.patch index be8464b..da6608d 100644 --- a/x86-MSI-X-teardown.patch +++ b/x86-MSI-X-teardown.patch @@ -1,5 +1,3 @@ -References: bsc#907514 bsc#910258 bsc#918984 bsc#923967 - x86/MSI-X: be more careful during teardown When a device gets detached from a guest, pciback will clear its @@ -21,19 +19,13 @@ common) system behavior. Signed-off-by: Jan Beulich Reviewed-by: Andrew Cooper --- -The use of the mask-all bit here collides with qemu's incorreect use -of that same bit. This would become a security issue if released that -way. A later patch in this series will provide the infrastructure for -qemu to stop direct access to that bit. A qemu series including a patch -making use of the new interface will be sent subsequently. - Backporting note (largely to myself): Depends on (not yet backported to 4.4 and earlier) commit 061eebe0e "x86/MSI: drop workaround for insecure Dom0 kernels" (due to re-use of struct arch_msix's warned field). ---- trunk.orig/xen/arch/x86/irq.c 2015-06-03 16:55:05.000000000 +0200 -+++ trunk/xen/arch/x86/irq.c 2015-03-25 09:36:52.000000000 +0100 +--- sle12sp1.orig/xen/arch/x86/irq.c 2015-07-08 11:47:52.000000000 +0200 ++++ sle12sp1/xen/arch/x86/irq.c 2015-07-07 18:01:32.000000000 +0200 @@ -217,9 +217,9 @@ void destroy_irq(unsigned int irq) } @@ -65,9 +57,9 @@ Backporting note (largely to myself): /* * Mark any remaining pending EOIs as ready to flush. 
---- trunk.orig/xen/arch/x86/msi.c 2015-05-19 23:16:48.000000000 +0200 -+++ trunk/xen/arch/x86/msi.c 2015-03-25 09:35:38.000000000 +0100 -@@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_ +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-06-22 09:23:08.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-07-07 18:01:16.000000000 +0200 +@@ -123,6 +123,27 @@ static void msix_put_fixmap(struct arch_ spin_unlock(&msix->table_lock); } @@ -95,7 +87,7 @@ Backporting note (largely to myself): /* * MSI message composition */ -@@ -162,7 +183,7 @@ void msi_compose_msg(unsigned vector, co +@@ -166,7 +187,7 @@ void msi_compose_msg(unsigned vector, co } } @@ -104,16 +96,16 @@ Backporting note (largely to myself): { switch ( entry->msi_attrib.type ) { -@@ -198,6 +219,8 @@ static void read_msi_msg(struct msi_desc - void __iomem *base; - base = entry->mask_base; +@@ -201,6 +222,8 @@ static void read_msi_msg(struct msi_desc + { + void __iomem *base = entry->mask_base; + if ( unlikely(!memory_decoded(entry->dev)) ) + return 0; msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); -@@ -209,6 +232,8 @@ static void read_msi_msg(struct msi_desc +@@ -212,6 +235,8 @@ static void read_msi_msg(struct msi_desc if ( iommu_intremap ) iommu_read_msi_from_ire(entry, msg); @@ -122,16 +114,16 @@ Backporting note (largely to myself): } static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) -@@ -260,6 +285,8 @@ static int write_msi_msg(struct msi_desc - void __iomem *base; - base = entry->mask_base; +@@ -262,6 +287,8 @@ static int write_msi_msg(struct msi_desc + { + void __iomem *base = entry->mask_base; + if ( unlikely(!memory_decoded(entry->dev)) ) + return -ENXIO; writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); writel(msg->address_hi, -@@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d +@@ -289,7 +316,8 @@ void set_msi_affinity(struct irq_desc *d ASSERT(spin_is_locked(&desc->lock)); memset(&msg, 0, sizeof(msg)); @@ -141,17 +133,18 @@ Backporting note (largely to myself): msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(desc->arch.vector); -@@ -347,20 +375,24 @@ int msi_maskable_irq(const struct msi_de +@@ -349,23 +377,27 @@ int msi_maskable_irq(const struct msi_de || entry->msi_attrib.maskbit; } --static void msi_set_mask_bit(struct irq_desc *desc, int flag) -+static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag) +-static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest) ++static bool_t msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest) { struct msi_desc *entry = desc->msi_desc; + struct pci_dev *pdev; + u16 seg; + u8 bus, slot, func; + bool_t flag = host || guest; ASSERT(spin_is_locked(&desc->lock)); BUG_ON(!entry || !entry->dev); @@ -160,9 +153,11 @@ Backporting note (largely to myself): + bus = pdev->bus; + slot = PCI_SLOT(pdev->devfn); + func = PCI_FUNC(pdev->devfn); - switch (entry->msi_attrib.type) { + switch ( entry->msi_attrib.type ) + { case PCI_CAP_ID_MSI: - if (entry->msi_attrib.maskbit) { + if ( entry->msi_attrib.maskbit ) + { u32 mask_bits; - u16 seg = entry->dev->seg; - u8 bus = entry->dev->bus; @@ -171,7 +166,7 @@ Backporting note (largely to myself): mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos); mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr); -@@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_ +@@ -374,25 +406,54 @@ static void 
msi_set_mask_bit(struct irq_ } break; case PCI_CAP_ID_MSIX: @@ -192,6 +187,7 @@ Backporting note (largely to myself): + u16 control; + domid_t domid = pdev->domain->domain_id; + ++ pdev->msix->host_maskall = 1; + control = pci_conf_read16(seg, bus, slot, func, + msix_control_reg(entry->msi_attrib.pos)); + if ( control & PCI_MSIX_FLAGS_MASKALL ) @@ -215,7 +211,8 @@ Backporting note (largely to myself): - break; + return 0; } - entry->msi_attrib.masked = !!flag; + entry->msi_attrib.host_masked = host; + entry->msi_attrib.guest_masked = guest; + + return 1; } @@ -234,7 +231,7 @@ Backporting note (largely to myself): break; return (pci_conf_read32(entry->dev->seg, entry->dev->bus, PCI_SLOT(entry->dev->devfn), -@@ -394,6 +454,8 @@ static int msi_get_mask_bit(const struct +@@ -400,6 +461,8 @@ static int msi_get_mask_bit(const struct entry->msi.mpos) >> entry->msi_attrib.entry_nr) & 1; case PCI_CAP_ID_MSIX: @@ -243,24 +240,44 @@ Backporting note (largely to myself): return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1; } return -1; -@@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct +@@ -407,12 +470,16 @@ static int msi_get_mask_bit(const struct void mask_msi_irq(struct irq_desc *desc) { -- msi_set_mask_bit(desc, 1); -+ if ( unlikely(!msi_set_mask_bit(desc, 1)) ) +- msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked); ++ if ( unlikely(!msi_set_mask_bit(desc, 1, ++ desc->msi_desc->msi_attrib.guest_masked)) ) + BUG_ON(!(desc->status & IRQ_DISABLED)); } void unmask_msi_irq(struct irq_desc *desc) { -- msi_set_mask_bit(desc, 0); -+ if ( unlikely(!msi_set_mask_bit(desc, 0)) ) +- msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked); ++ if ( unlikely(!msi_set_mask_bit(desc, 0, ++ desc->msi_desc->msi_attrib.guest_masked)) ) + WARN(); } + void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask) +@@ -422,13 +489,15 @@ void guest_mask_msi_irq(struct irq_desc + static unsigned int startup_msi_irq(struct irq_desc *desc) -@@ -723,6 +787,9 @@ static int msix_capability_init(struct p + { +- msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST)); ++ if ( unlikely(!msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST))) ) ++ WARN(); + return 0; + } + + static void shutdown_msi_irq(struct irq_desc *desc) + { +- msi_set_mask_bit(desc, 1, 1); ++ if ( unlikely(!msi_set_mask_bit(desc, 1, 1)) ) ++ BUG_ON(!(desc->status & IRQ_DISABLED)); + } + + void ack_nonmaskable_msi_irq(struct irq_desc *desc) +@@ -740,6 +809,9 @@ static int msix_capability_init(struct p control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ @@ -270,7 +287,7 @@ Backporting note (largely to myself): if ( desc ) { entry = alloc_msi_entry(1); -@@ -855,7 +922,8 @@ static int msix_capability_init(struct p +@@ -879,7 +951,8 @@ static int msix_capability_init(struct p ++msix->used_entries; /* Restore MSI-X enabled bits */ @@ -280,7 +297,7 @@ Backporting note (largely to myself): return 0; } -@@ -1008,8 +1076,16 @@ static void __pci_disable_msix(struct ms +@@ -1024,8 +1097,16 @@ static void __pci_disable_msix(struct ms BUG_ON(list_empty(&dev->msi_list)); @@ -299,7 +316,7 @@ Backporting note (largely to myself): pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); _pci_cleanup_msix(dev->msix); -@@ -1147,14 +1223,23 @@ int pci_restore_msi_state(struct pci_dev +@@ -1199,15 +1280,24 @@ int pci_restore_msi_state(struct pci_dev nr = entry->msi.nvec; } else if ( entry->msi_attrib.type == 
PCI_CAP_ID_MSIX ) @@ -317,9 +334,11 @@ Backporting note (largely to myself): for ( i = 0; ; ) { -- msi_set_mask_bit(desc, entry[i].msi_attrib.masked); +- msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked, +- entry[i].msi_attrib.guest_masked); + if ( unlikely(!msi_set_mask_bit(desc, -+ entry[i].msi_attrib.masked)) ) ++ entry[i].msi_attrib.host_masked, ++ entry[i].msi_attrib.guest_masked)) ) + BUG(); if ( !--nr ) diff --git a/x86-MSI-mask.patch b/x86-MSI-mask.patch new file mode 100644 index 0000000..26502e1 --- /dev/null +++ b/x86-MSI-mask.patch @@ -0,0 +1,48 @@ +x86/MSI: properly track guest masking requests + +... by monitoring writes to the mask register. + +This allows reverting the main effect of the XSA-129 patches in qemu. + +Signed-off-by: Jan Beulich + +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-07-07 18:01:41.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-07-07 18:01:47.000000000 +0200 +@@ -1303,6 +1303,37 @@ int pci_msi_conf_write_intercept(struct + return 1; + } + ++ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI); ++ if ( entry && entry->msi_attrib.maskbit ) ++ { ++ uint16_t cntl; ++ uint32_t unused; ++ ++ pos = entry->msi_attrib.pos; ++ if ( reg < pos || reg >= entry->msi.mpos + 8 ) ++ return 0; ++ ++ if ( reg == msi_control_reg(pos) ) ++ return size == 2 ? 1 : -EACCES; ++ if ( reg < entry->msi.mpos || reg >= entry->msi.mpos + 4 || size != 4 ) ++ return -EACCES; ++ ++ cntl = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos)); ++ unused = ~(uint32_t)0 >> (32 - multi_msi_capable(cntl)); ++ for ( pos = 0; pos < entry->msi.nvec; ++pos, ++entry ) ++ { ++ entry->msi_attrib.guest_masked = ++ *data >> entry->msi_attrib.entry_nr; ++ if ( entry->msi_attrib.host_masked ) ++ *data |= 1 << pos; ++ unused &= ~(1 << pos); ++ } ++ ++ *data |= unused; ++ ++ return 1; ++ } ++ + return 0; + } + diff --git a/x86-MSI-pv-unmask.patch b/x86-MSI-pv-unmask.patch new file mode 100644 index 0000000..ccdd02d --- /dev/null +++ b/x86-MSI-pv-unmask.patch @@ -0,0 +1,93 @@ +x86/MSI: fix guest unmasking when handling IRQ via event channel + +Rather than assuming only PV guests need special treatment (and +dealing with that directly when an IRQ gets set up), keep all guest MSI +IRQs masked until either the (HVM) guest unmasks them via vMSI or the +(PV, PVHVM, or PVH) guest sets up an event channel for it. + +To not further clutter the common evtchn_bind_pirq() with x86-specific +code, introduce an arch_evtchn_bind_pirq() hook instead. 
+ +Reported-by: Sander Eikelenboom +Signed-off-by: Jan Beulich +Tested-by: Sander Eikelenboom +Reviewed-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/irq.c 2015-07-08 12:33:47.000000000 +0200 ++++ sle12sp1/xen/arch/x86/irq.c 2015-07-07 17:04:08.000000000 +0200 +@@ -2502,6 +2502,25 @@ int unmap_domain_pirq_emuirq(struct doma + return ret; + } + ++void arch_evtchn_bind_pirq(struct domain *d, int pirq) ++{ ++ int irq = domain_pirq_to_irq(d, pirq); ++ struct irq_desc *desc; ++ unsigned long flags; ++ ++ if ( irq <= 0 ) ++ return; ++ ++ if ( is_hvm_domain(d) ) ++ map_domain_emuirq_pirq(d, pirq, IRQ_PT); ++ ++ desc = irq_to_desc(irq); ++ spin_lock_irqsave(&desc->lock, flags); ++ if ( desc->msi_desc ) ++ guest_mask_msi_irq(desc, 0); ++ spin_unlock_irqrestore(&desc->lock, flags); ++} ++ + bool_t hvm_domain_use_pirq(const struct domain *d, const struct pirq *pirq) + { + return is_hvm_domain(d) && pirq && +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-07-08 00:00:00.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-07-07 16:50:02.000000000 +0200 +@@ -422,10 +422,7 @@ void guest_mask_msi_irq(struct irq_desc + + static unsigned int startup_msi_irq(struct irq_desc *desc) + { +- bool_t guest_masked = (desc->status & IRQ_GUEST) && +- is_hvm_domain(desc->msi_desc->dev->domain); +- +- msi_set_mask_bit(desc, 0, guest_masked); ++ msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST)); + return 0; + } + +--- sle12sp1.orig/xen/common/event_channel.c 2015-07-08 14:04:08.000000000 +0200 ++++ sle12sp1/xen/common/event_channel.c 2015-07-07 16:53:47.000000000 +0200 +@@ -504,10 +504,7 @@ static long evtchn_bind_pirq(evtchn_bind + + bind->port = port; + +-#ifdef CONFIG_X86 +- if ( is_hvm_domain(d) && domain_pirq_to_irq(d, pirq) > 0 ) +- map_domain_emuirq_pirq(d, pirq, IRQ_PT); +-#endif ++ arch_evtchn_bind_pirq(d, pirq); + + out: + spin_unlock(&d->event_lock); +--- sle12sp1.orig/xen/include/asm-arm/irq.h 2015-07-08 12:33:47.000000000 +0200 ++++ sle12sp1/xen/include/asm-arm/irq.h 2015-07-07 17:02:00.000000000 +0200 +@@ -44,6 +44,8 @@ int route_irq_to_guest(struct domain *d, + const char *devname); + void arch_move_irqs(struct vcpu *v); + ++#define arch_evtchn_bind_pirq(d, pirq) ((void)((d) + (pirq))) ++ + /* Set IRQ type for an SPI */ + int irq_set_spi_type(unsigned int spi, unsigned int type); + +--- sle12sp1.orig/xen/include/xen/irq.h 2015-07-08 12:33:47.000000000 +0200 ++++ sle12sp1/xen/include/xen/irq.h 2015-07-07 17:02:49.000000000 +0200 +@@ -172,4 +172,8 @@ unsigned int set_desc_affinity(struct ir + unsigned int arch_hwdom_irqs(domid_t); + #endif + ++#ifndef arch_evtchn_bind_pirq ++void arch_evtchn_bind_pirq(struct domain *, int pirq); ++#endif ++ + #endif /* __XEN_IRQ_H__ */ diff --git a/x86-PCI-CFG-write-intercept.patch b/x86-PCI-CFG-write-intercept.patch new file mode 100644 index 0000000..da4653f --- /dev/null +++ b/x86-PCI-CFG-write-intercept.patch @@ -0,0 +1,114 @@ +x86/PCI: add config space abstract write intercept logic + +This is to be used by MSI code, and later to also be hooked up to +MMCFG accesses by Dom0. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- sle12sp1.orig/xen/arch/x86/msi.c 2015-07-08 11:45:59.000000000 +0200 ++++ sle12sp1/xen/arch/x86/msi.c 2015-06-22 09:06:30.000000000 +0200 +@@ -1108,6 +1108,12 @@ void pci_cleanup_msi(struct pci_dev *pde + msi_free_irqs(pdev); + } + ++int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg, ++ unsigned int size, uint32_t *data) ++{ ++ return 0; ++} ++ + int pci_restore_msi_state(struct pci_dev *pdev) + { + unsigned long flags; +--- sle12sp1.orig/xen/arch/x86/pci.c 2015-07-08 11:45:59.000000000 +0200 ++++ sle12sp1/xen/arch/x86/pci.c 2015-06-19 16:08:11.000000000 +0200 +@@ -67,3 +67,28 @@ void pci_conf_write(uint32_t cf8, uint8_ + + spin_unlock_irqrestore(&pci_config_lock, flags); + } ++ ++int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, ++ unsigned int reg, unsigned int size, ++ uint32_t *data) ++{ ++ struct pci_dev *pdev; ++ int rc = 0; ++ ++ /* ++ * Avoid expensive operations when no hook is going to do anything ++ * for the access anyway. ++ */ ++ if ( reg < 64 || reg >= 256 ) ++ return 0; ++ ++ spin_lock(&pcidevs_lock); ++ ++ pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf)); ++ if ( pdev ) ++ rc = pci_msi_conf_write_intercept(pdev, reg, size, data); ++ ++ spin_unlock(&pcidevs_lock); ++ ++ return rc; ++} +--- sle12sp1.orig/xen/arch/x86/traps.c 2015-07-08 11:45:59.000000000 +0200 ++++ sle12sp1/xen/arch/x86/traps.c 2015-06-19 15:52:47.000000000 +0200 +@@ -1708,8 +1708,8 @@ static int admin_io_okay( + return ioports_access_permitted(v->domain, port, port + bytes - 1); + } + +-static bool_t pci_cfg_ok(struct domain *currd, bool_t write, +- unsigned int start, unsigned int size) ++static bool_t pci_cfg_ok(struct domain *currd, unsigned int start, ++ unsigned int size, uint32_t *write) + { + uint32_t machine_bdf; + +@@ -1741,8 +1741,12 @@ static bool_t pci_cfg_ok(struct domain * + start |= CF8_ADDR_HI(currd->arch.pci_cf8); + } + +- return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, +- start, start + size - 1, write); ++ if ( xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, ++ start, start + size - 1, !!write) != 0 ) ++ return 0; ++ ++ return !write || ++ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0; + } + + uint32_t guest_io_read( +@@ -1796,7 +1800,7 @@ uint32_t guest_io_read( + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; +- if ( pci_cfg_ok(v->domain, 0, port & 3, size) ) ++ if ( pci_cfg_ok(v->domain, port & 3, size, NULL) ) + sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); + } + +@@ -1869,7 +1873,7 @@ void guest_io_write( + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; +- if ( pci_cfg_ok(v->domain, 1, port & 3, size) ) ++ if ( pci_cfg_ok(v->domain, port & 3, size, &data) ) + pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); + } + +--- sle12sp1.orig/xen/include/asm-x86/pci.h 2015-07-08 11:45:59.000000000 +0200 ++++ sle12sp1/xen/include/asm-x86/pci.h 2015-06-19 15:52:03.000000000 +0200 +@@ -15,4 +15,11 @@ struct arch_pci_dev { + vmask_t used_vectors; + }; + ++struct pci_dev; ++int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, ++ unsigned int reg, unsigned int size, ++ uint32_t *data); ++int pci_msi_conf_write_intercept(struct pci_dev *, unsigned int reg, ++ unsigned int size, uint32_t *data); ++ + #endif /* __X86_PCI_H__ */ diff --git a/x86-pci_cfg_okay.patch b/x86-pci_cfg_okay.patch new file mode 100644 index 0000000..95ddd29 --- /dev/null +++ 
b/x86-pci_cfg_okay.patch @@ -0,0 +1,156 @@ +# Commit 85baced14dec2fafa9fe560969dba2ae28e8bebb +# Date 2015-06-09 15:59:31 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: adjust PV I/O emulation functions' types + +admin_io_okay(), guest_io_read(), and guest_io_write() all don't need +their current "regs" parameter at all, and they don't use the vCPU +passed to them for other than obtaining its domain. Drop the former and +replace the latter by a struct domain pointer. + +pci_cfg_okay() returns a boolean type, and its "write" parameter is of +boolean kind too. + +All of them get called for the current vCPU (and hence current domain) +only, so name the domain parameters accordingly except in the +admin_io_okay() case, which a subsequent patch will use for simplifying +setup_io_bitmap(). + +Latch current->domain into a local variable in emulate_privileged_op(). + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +# Commit 2d67a7a4d37a4759bcd7f2ee2d740497ad669c7d +# Date 2015-06-18 15:07:10 +0200 +# Author Jan Beulich +# Committer Jan Beulich +x86: synchronize PCI config space access decoding + +Both PV and HVM logic have similar but not similar enough code here. +Synchronize the two so that +- in the HVM case we don't unconditionally try to access extended + config space +- in the PV case we pass a correct range to the XSM hook +- in the PV case we don't needlessly deny access when the operation + isn't really on PCI config space +All this along with sharing the macros HVM already had here. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +Backport stripped down to just the pci_cfg_ok() adjustments. + +--- sle12sp1.orig/xen/arch/x86/traps.c 2015-07-08 14:13:16.000000000 +0200 ++++ sle12sp1/xen/arch/x86/traps.c 2015-07-08 11:43:22.000000000 +0200 +@@ -1708,14 +1708,18 @@ static int admin_io_okay( + return ioports_access_permitted(v->domain, port, port + bytes - 1); + } + +-static int pci_cfg_ok(struct domain *d, int write, int size) ++static bool_t pci_cfg_ok(struct domain *currd, bool_t write, ++ unsigned int start, unsigned int size) + { + uint32_t machine_bdf; +- uint16_t start, end; +- if (!is_hardware_domain(d)) ++ ++ if ( !is_hardware_domain(currd) ) + return 0; + +- machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF; ++ if ( !CF8_ENABLED(currd->arch.pci_cf8) ) ++ return 1; ++ ++ machine_bdf = CF8_BDF(currd->arch.pci_cf8); + if ( write ) + { + const unsigned long *ro_map = pci_get_ro_map(0); +@@ -1723,9 +1727,9 @@ static int pci_cfg_ok(struct domain *d, + if ( ro_map && test_bit(machine_bdf, ro_map) ) + return 0; + } +- start = d->arch.pci_cf8 & 0xFF; ++ start |= CF8_ADDR_LO(currd->arch.pci_cf8); + /* AMD extended configuration space access? 
*/ +- if ( (d->arch.pci_cf8 & 0x0F000000) && ++ if ( CF8_ADDR_HI(currd->arch.pci_cf8) && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) + { +@@ -1734,12 +1738,11 @@ static int pci_cfg_ok(struct domain *d, + if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) ) + return 0; + if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) ) +- start |= (d->arch.pci_cf8 >> 16) & 0xF00; ++ start |= CF8_ADDR_HI(currd->arch.pci_cf8); + } +- end = start + size - 1; +- if (xsm_pci_config_permission(XSM_HOOK, d, machine_bdf, start, end, write)) +- return 0; +- return 1; ++ ++ return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, ++ start, start + size - 1, write); + } + + uint32_t guest_io_read( +@@ -1793,7 +1796,7 @@ uint32_t guest_io_read( + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; +- if ( pci_cfg_ok(v->domain, 0, size) ) ++ if ( pci_cfg_ok(v->domain, 0, port & 3, size) ) + sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); + } + +@@ -1866,7 +1869,7 @@ void guest_io_write( + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; +- if ( pci_cfg_ok(v->domain, 1, size) ) ++ if ( pci_cfg_ok(v->domain, 1, port & 3, size) ) + pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); + } + +--- sle12sp1.orig/xen/arch/x86/hvm/hvm.c 2015-07-08 14:13:38.000000000 +0200 ++++ sle12sp1/xen/arch/x86/hvm/hvm.c 2015-07-08 11:43:22.000000000 +0200 +@@ -2356,11 +2356,6 @@ void hvm_vcpu_down(struct vcpu *v) + static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d, + ioreq_t *p) + { +-#define CF8_BDF(cf8) (((cf8) & 0x00ffff00) >> 8) +-#define CF8_ADDR_LO(cf8) ((cf8) & 0x000000fc) +-#define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16) +-#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000)) +- + struct hvm_ioreq_server *s; + uint32_t cf8; + uint8_t type; +@@ -2445,11 +2440,6 @@ static struct hvm_ioreq_server *hvm_sele + } + + return d->arch.hvm_domain.default_ioreq_server; +- +-#undef CF8_ADDR_ENABLED +-#undef CF8_ADDR_HI +-#undef CF8_ADDR_LO +-#undef CF8_BDF + } + + int hvm_buffered_io_send(ioreq_t *p) +--- sle12sp1.orig/xen/include/asm-x86/pci.h 2015-07-08 14:13:16.000000000 +0200 ++++ sle12sp1/xen/include/asm-x86/pci.h 2015-07-08 11:43:22.000000000 +0200 +@@ -1,6 +1,11 @@ + #ifndef __X86_PCI_H__ + #define __X86_PCI_H__ + ++#define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8) ++#define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc) ++#define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16) ++#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000)) ++ + #define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \ + || id == 0x01268086 || id == 0x01028086 \ + || id == 0x01128086 || id == 0x01228086 \ diff --git a/xen.changes b/xen.changes index b18d488..ad76df0 100644 --- a/xen.changes +++ b/xen.changes @@ -1,3 +1,53 @@ +------------------------------------------------------------------- +Wed Jul 8 11:38:26 MDT 2015 - carnold@suse.com + +- bnc#935634 - VUL-0: CVE-2015-3259: xen: XSA-137: xl command line + config handling stack overflow + CVE-2015-3259-xsa137.patch +- Upstream patches from Jan + 558bfaa0-x86-traps-avoid-using-current-too-early.patch + 5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch + 559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch + 559bdde5-pull-in-latest-linux-earlycpio.patch +- Upstream patches from Jan pending review + 552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch + 552d0fe8-x86-mtrr-include-asm-atomic.h.patch + 552d293b-x86-vMSI-X-honor-all-mask-requests.patch + 
552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch + 554c7aee-x86-provide-arch_fetch_and_add.patch + 554c7b00-arm-provide-arch_fetch_and_add.patch + 55534b0a-x86-provide-add_sized.patch + 55534b25-arm-provide-add_sized.patch + 5555a4f8-use-ticket-locks-for-spin-locks.patch + 5555a5b9-x86-arm-remove-asm-spinlock-h.patch + 5555a8ec-introduce-non-contiguous-allocation.patch + 55795a52-x86-vMSI-X-support-qword-MMIO-access.patch + 557eb55f-gnttab-per-active-entry-locking.patch + 557eb5b6-gnttab-introduce-maptrack-lock.patch + 557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch + 557ffab8-evtchn-factor-out-freeing-an-event-channel.patch + 5582bf43-evtchn-simplify-port_is_valid.patch + 5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch + 5583d9c5-x86-MSI-X-cleanup.patch + 5583da09-x86-MSI-track-host-and-guest-masking-separately.patch + 5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch + 5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch + 5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch + 5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch + 5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch + 5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch + x86-MSI-pv-unmask.patch + x86-pci_cfg_okay.patch + x86-PCI-CFG-write-intercept.patch + x86-MSI-X-maskall.patch + x86-MSI-X-teardown.patch + x86-MSI-X-enable.patch + x86-MSI-mask.patch +- Dropped + qemu-MSI-X-enable-maskall.patch + qemu-MSI-X-latch-writes.patch + x86-MSI-X-guest-mask.patch + ------------------------------------------------------------------- Tue Jun 30 08:25:35 MDT 2015 - carnold@suse.com diff --git a/xen.spec b/xen.spec index e55349c..5478004 100644 --- a/xen.spec +++ b/xen.spec @@ -90,6 +90,7 @@ BuildRequires: dev86 #!BuildIgnore: gcc-PIE BuildRequires: bison BuildRequires: fdupes +BuildRequires: figlet BuildRequires: flex BuildRequires: glib2-devel BuildRequires: libaio-devel @@ -201,13 +202,20 @@ Source99: baselibs.conf # http://xenbits.xensource.com/ext/xenalyze Source20000: xenalyze.hg.tar.bz2 # Upstream patches -Patch1: 551ac326-xentop-add-support-for-qdisk.patch -Patch2: 5548e903-domctl-don-t-truncate-XEN_DOMCTL_max_mem-requests.patch -Patch3: 554cc211-libxl-add-qxl.patch -Patch4: 556d973f-unmodified-drivers-tolerate-IRQF_DISABLED-being-undefined.patch -Patch5: 5576f178-kexec-add-more-pages-to-v1-environment.patch -Patch6: 55780be1-x86-EFI-adjust-EFI_MEMORY_WP-handling-for-spec-version-2.5.patch +Patch1: 55103616-vm-assist-prepare-for-discontiguous-used-bit-numbers.patch +Patch2: 551ac326-xentop-add-support-for-qdisk.patch +Patch3: 5548e903-domctl-don-t-truncate-XEN_DOMCTL_max_mem-requests.patch +Patch4: 5548e95d-x86-allow-to-suppress-M2P-user-mode-exposure.patch +Patch5: 554cc211-libxl-add-qxl.patch +Patch6: 556d973f-unmodified-drivers-tolerate-IRQF_DISABLED-being-undefined.patch +Patch7: 5576f178-kexec-add-more-pages-to-v1-environment.patch +Patch8: 55780be1-x86-EFI-adjust-EFI_MEMORY_WP-handling-for-spec-version-2.5.patch +Patch9: 558bfaa0-x86-traps-avoid-using-current-too-early.patch +Patch10: 5592a116-nested-EPT-fix-the-handling-of-nested-EPT.patch +Patch11: 559b9dd6-x86-p2m-ept-don-t-unmap-in-use-EPT-pagetable.patch +Patch12: 559bdde5-pull-in-latest-linux-earlycpio.patch Patch131: CVE-2015-4106-xsa131-9.patch +Patch137: CVE-2015-3259-xsa137.patch # Upstream qemu Patch250: VNC-Support-for-ExtendedKeyEvent-client-message.patch Patch251: 0001-net-move-the-tap-buffer-into-TAPState.patch @@ -218,15 +226,6 @@ 
Patch255: 0005-e1000-multi-buffer-packet-support.patch Patch256: 0006-e1000-clear-EOP-for-multi-buffer-descriptors.patch Patch257: 0007-e1000-verify-we-have-buffers-upfront.patch Patch258: 0008-e1000-check-buffer-availability.patch -# Extra patches pending review -Patch150: 55103616-vm-assist-prepare-for-discontiguous-used-bit-numbers.patch -Patch151: 5548e95d-x86-allow-to-suppress-M2P-user-mode-exposure.patch -Patch156: x86-MSI-X-teardown.patch -Patch157: x86-MSI-X-enable.patch -Patch158: x86-MSI-X-guest-mask.patch -Patch159: x86-MSI-X-maskall.patch -Patch160: qemu-MSI-X-latch-writes.patch -Patch161: qemu-MSI-X-enable-maskall.patch # Our platform specific patches Patch301: xen-destdir.patch Patch302: vif-bridge-no-iptables.patch @@ -309,6 +308,40 @@ Patch605: xen.build-compare.vgabios.patch Patch606: xen.build-compare.seabios.patch Patch607: xen.build-compare.man.patch Patch608: ipxe-no-error-logical-not-parentheses.patch +# Extra patches pending review +Patch801: 552d0fd2-x86-hvm-don-t-include-asm-spinlock-h.patch +Patch802: 552d0fe8-x86-mtrr-include-asm-atomic.h.patch +Patch803: 552d293b-x86-vMSI-X-honor-all-mask-requests.patch +Patch804: 552d2966-x86-vMSI-X-add-valid-bits-for-read-acceleration.patch +Patch805: 554c7aee-x86-provide-arch_fetch_and_add.patch +Patch806: 554c7b00-arm-provide-arch_fetch_and_add.patch +Patch807: 55534b0a-x86-provide-add_sized.patch +Patch808: 55534b25-arm-provide-add_sized.patch +Patch809: 5555a4f8-use-ticket-locks-for-spin-locks.patch +Patch810: 5555a5b9-x86-arm-remove-asm-spinlock-h.patch +Patch811: 5555a8ec-introduce-non-contiguous-allocation.patch +Patch812: 55795a52-x86-vMSI-X-support-qword-MMIO-access.patch +Patch813: 557eb55f-gnttab-per-active-entry-locking.patch +Patch814: 557eb5b6-gnttab-introduce-maptrack-lock.patch +Patch815: 557eb620-gnttab-make-the-grant-table-lock-a-read-write-lock.patch +Patch816: 557ffab8-evtchn-factor-out-freeing-an-event-channel.patch +Patch817: 5582bf43-evtchn-simplify-port_is_valid.patch +Patch818: 5582bf81-evtchn-remove-the-locking-when-unmasking-an-event-channel.patch +Patch819: 5583d9c5-x86-MSI-X-cleanup.patch +Patch820: 5583da09-x86-MSI-track-host-and-guest-masking-separately.patch +Patch821: 5583da64-gnttab-use-per-VCPU-maptrack-free-lists.patch +Patch822: 5583da8c-gnttab-steal-maptrack-entries-from-other-VCPUs.patch +Patch823: 5587d711-evtchn-clear-xen_consumer-when-clearing-state.patch +Patch824: 5587d779-evtchn-defer-freeing-struct-evtchn-s-until-evtchn_destroy_final.patch +Patch825: 5587d7b7-evtchn-use-a-per-event-channel-lock-for-sending-events.patch +Patch826: 5587d7e2-evtchn-pad-struct-evtchn-to-64-bytes.patch +Patch850: x86-MSI-pv-unmask.patch +Patch851: x86-pci_cfg_okay.patch +Patch852: x86-PCI-CFG-write-intercept.patch +Patch853: x86-MSI-X-maskall.patch +Patch854: x86-MSI-X-teardown.patch +Patch855: x86-MSI-X-enable.patch +Patch856: x86-MSI-mask.patch # Build patches Patch99996: xen.stubdom.newlib.patch Patch99998: tmp_build.patch @@ -521,7 +554,14 @@ Authors: %patch4 -p1 %patch5 -p1 %patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 %patch131 -p1 +%patch137 -p1 # Upstream qemu patches %patch250 -p1 %patch251 -p1 @@ -532,15 +572,6 @@ Authors: %patch256 -p1 %patch257 -p1 %patch258 -p1 -# Extra patches pending review -%patch150 -p1 -%patch151 -p1 -%patch156 -p1 -%patch157 -p1 -%patch158 -p1 -%patch159 -p1 -%patch160 -p1 -%patch161 -p1 # Our platform specific patches %patch301 -p1 %patch302 -p1 @@ -622,6 +653,40 @@ Authors: %patch606 -p1 %patch607 -p1 %patch608 -p1 +# 
Extra patches pending review +%patch801 -p1 +%patch802 -p1 +%patch803 -p1 +%patch804 -p1 +%patch805 -p1 +%patch806 -p1 +%patch807 -p1 +%patch808 -p1 +%patch809 -p1 +%patch810 -p1 +%patch811 -p1 +%patch812 -p1 +%patch813 -p1 +%patch814 -p1 +%patch815 -p1 +%patch816 -p1 +%patch817 -p1 +%patch818 -p1 +%patch819 -p1 +%patch820 -p1 +%patch821 -p1 +%patch822 -p1 +%patch823 -p1 +%patch824 -p1 +%patch825 -p1 +%patch826 -p1 +%patch850 -p1 +%patch851 -p1 +%patch852 -p1 +%patch853 -p1 +%patch854 -p1 +%patch855 -p1 +%patch856 -p1 # Build patches %patch99996 -p1 %patch99998 -p1