- add xorg-x11-util-devel to BuildRequires to get lndir(1)

- remove xen.migrate.tools_notify_restore_to_hangup_during_migration_--abort_if_busy.patch
  It changed the migration protocol, and upstream wants a different solution

- bnc#802221 - fix xenpaging
  re-add xenpaging.qemu.flush-cache.patch

- Upstream patches from Jan
  26891-x86-S3-Fix-cpu-pool-scheduling-after-suspend-resume.patch
  26930-x86-EFI-fix-runtime-call-status-for-compat-mode-Dom0.patch
- Additional fix for bnc#816159
  CVE-2013-1918-xsa45-followup.patch

- bnc#817068 - Xen guest with >1 sr-iov vf won't start
  xen-managed-pci-device.patch

- Update to Xen 4.2.2 c/s 26064
  The following recent security patches are included in the tarball:
  CVE-2013-0151-xsa34.patch (bnc#797285)
  CVE-2012-6075-xsa41.patch (bnc#797523)
  CVE-2013-1917-xsa44.patch (bnc#813673)
  CVE-2013-1919-xsa46.patch (bnc#813675)

- Upstream patch from Jan
  26902-x86-EFI-pass-boot-services-variable-info-to-runtime-code.patch 

- bnc#816159 - VUL-0: xen: CVE-2013-1918: XSA-45: Several long
  latency operations are not preemptible
  CVE-2013-1918-xsa45-1-vcpu-destroy-pagetables-preemptible.patch

OBS-URL: https://build.opensuse.org/package/show/Virtualization/xen?expand=0&rev=237
Commit b9d38dfc8d (parent 9c5584a232) by Olaf Hering, 2013-05-07 14:35:00 +0000, committed by Git OBS Bridge.
119 changed files with 2932 additions and 5587 deletions


@ -12,10 +12,10 @@ the fixmaps together with other boot time page table construction.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Index: xen-4.2.0-testing/xen/arch/x86/boot/head.S
Index: xen-4.2.2-testing/xen/arch/x86/boot/head.S
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/boot/head.S
+++ xen-4.2.0-testing/xen/arch/x86/boot/head.S
--- xen-4.2.2-testing.orig/xen/arch/x86/boot/head.S
+++ xen-4.2.2-testing/xen/arch/x86/boot/head.S
@@ -3,6 +3,7 @@
#include <public/xen.h>
#include <asm/asm_defns.h>
@ -57,10 +57,10 @@ Index: xen-4.2.0-testing/xen/arch/x86/boot/head.S
#endif
/* Initialize 4kB mappings of first 2MB or 4MB of memory. */
Index: xen-4.2.0-testing/xen/arch/x86/efi/boot.c
Index: xen-4.2.2-testing/xen/arch/x86/efi/boot.c
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/efi/boot.c
+++ xen-4.2.0-testing/xen/arch/x86/efi/boot.c
--- xen-4.2.2-testing.orig/xen/arch/x86/efi/boot.c
+++ xen-4.2.2-testing/xen/arch/x86/efi/boot.c
@@ -17,6 +17,9 @@
#include <xen/vga.h>
#include <asm/e820.h>
@ -92,11 +92,11 @@ Index: xen-4.2.0-testing/xen/arch/x86/efi/boot.c
/* Initialise L3 boot-map page directory entries. */
l3_bootmap[l3_table_offset(xen_phys_start)] =
l3e_from_paddr((UINTN)l2_bootmap, __PAGE_HYPERVISOR);
Index: xen-4.2.0-testing/xen/arch/x86/mm.c
Index: xen-4.2.2-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.0-testing/xen/arch/x86/mm.c
@@ -130,6 +130,10 @@
--- xen-4.2.2-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.2-testing/xen/arch/x86/mm.c
@@ -131,6 +131,10 @@
l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
l1_identmap[L1_PAGETABLE_ENTRIES];
@ -107,10 +107,10 @@ Index: xen-4.2.0-testing/xen/arch/x86/mm.c
#define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
/*
Index: xen-4.2.0-testing/xen/arch/x86/x86_64/mm.c
Index: xen-4.2.2-testing/xen/arch/x86/x86_64/mm.c
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/x86_64/mm.c
+++ xen-4.2.0-testing/xen/arch/x86/x86_64/mm.c
--- xen-4.2.2-testing.orig/xen/arch/x86/x86_64/mm.c
+++ xen-4.2.2-testing/xen/arch/x86/x86_64/mm.c
@@ -65,6 +65,10 @@ l3_pgentry_t __attribute__ ((__section__
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
l2_xenmap[L2_PAGETABLE_ENTRIES];
@ -122,10 +122,10 @@ Index: xen-4.2.0-testing/xen/arch/x86/x86_64/mm.c
/* Enough page directories to map into the bottom 1GB. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
l3_bootmap[L3_PAGETABLE_ENTRIES];
Index: xen-4.2.0-testing/xen/include/asm-x86/config.h
Index: xen-4.2.2-testing/xen/include/asm-x86/config.h
===================================================================
--- xen-4.2.0-testing.orig/xen/include/asm-x86/config.h
+++ xen-4.2.0-testing/xen/include/asm-x86/config.h
--- xen-4.2.2-testing.orig/xen/include/asm-x86/config.h
+++ xen-4.2.2-testing/xen/include/asm-x86/config.h
@@ -317,7 +317,7 @@ extern unsigned char boot_edid_info[128]
#define MACHPHYS_MBYTES 16 /* 1 MB needed per 1 GB memory */
#define FRAMETABLE_MBYTES (MACHPHYS_MBYTES * 6)
@ -135,10 +135,10 @@ Index: xen-4.2.0-testing/xen/include/asm-x86/config.h
#define IOREMAP_VIRT_START (IOREMAP_VIRT_END - (IOREMAP_MBYTES<<20))
#define DIRECTMAP_VIRT_END IOREMAP_VIRT_START
#define DIRECTMAP_VIRT_START (DIRECTMAP_VIRT_END - (DIRECTMAP_MBYTES<<20))
Index: xen-4.2.0-testing/xen/include/asm-x86/fixmap.h
Index: xen-4.2.2-testing/xen/include/asm-x86/fixmap.h
===================================================================
--- xen-4.2.0-testing.orig/xen/include/asm-x86/fixmap.h
+++ xen-4.2.0-testing/xen/include/asm-x86/fixmap.h
--- xen-4.2.2-testing.orig/xen/include/asm-x86/fixmap.h
+++ xen-4.2.2-testing/xen/include/asm-x86/fixmap.h
@@ -13,12 +13,17 @@
#define _ASM_FIXMAP_H
@ -158,7 +158,7 @@ Index: xen-4.2.0-testing/xen/include/asm-x86/fixmap.h
#include <asm/amd-iommu.h>
#include <asm/msi.h>
#include <acpi/apei.h>
@@ -66,7 +71,6 @@ enum fixed_addresses {
@@ -68,7 +73,6 @@ enum fixed_addresses {
__end_of_fixed_addresses
};
@ -166,17 +166,17 @@ Index: xen-4.2.0-testing/xen/include/asm-x86/fixmap.h
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
@@ -90,4 +94,6 @@ static inline unsigned long virt_to_fix(
@@ -92,4 +96,6 @@ static inline unsigned long virt_to_fix(
return __virt_to_fix(vaddr);
}
+#endif /* __ASSEMBLY__ */
+
#endif
Index: xen-4.2.0-testing/xen/include/asm-x86/page.h
Index: xen-4.2.2-testing/xen/include/asm-x86/page.h
===================================================================
--- xen-4.2.0-testing.orig/xen/include/asm-x86/page.h
+++ xen-4.2.0-testing/xen/include/asm-x86/page.h
--- xen-4.2.2-testing.orig/xen/include/asm-x86/page.h
+++ xen-4.2.2-testing/xen/include/asm-x86/page.h
@@ -1,6 +1,8 @@
#ifndef __X86_PAGE_H__
#define __X86_PAGE_H__
@ -203,10 +203,10 @@ Index: xen-4.2.0-testing/xen/include/asm-x86/page.h
void paging_init(void);
void setup_idle_pagetable(void);
#endif /* !defined(__ASSEMBLY__) */
Index: xen-4.2.0-testing/xen/include/xen/const.h
Index: xen-4.2.2-testing/xen/include/xen/const.h
===================================================================
--- /dev/null
+++ xen-4.2.0-testing/xen/include/xen/const.h
+++ xen-4.2.2-testing/xen/include/xen/const.h
@@ -0,0 +1,24 @@
+/* const.h: Macros for dealing with constants. */
+


@ -14,8 +14,10 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
Index: xen-4.2.2-testing/docs/misc/xen-command-line.markdown
===================================================================
--- xen-4.2.2-testing.orig/docs/misc/xen-command-line.markdown
+++ xen-4.2.2-testing/docs/misc/xen-command-line.markdown
@@ -244,7 +244,7 @@ A typical setup for most situations migh
Specify the size of the console ring buffer.
@ -47,8 +49,10 @@ Acked-by: Keir Fraser <keir@xen.org>
### debug\_stack\_lines
> `= <integer>`
--- a/xen/arch/x86/Rules.mk
+++ b/xen/arch/x86/Rules.mk
Index: xen-4.2.2-testing/xen/arch/x86/Rules.mk
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/Rules.mk
+++ xen-4.2.2-testing/xen/arch/x86/Rules.mk
@@ -7,6 +7,7 @@ HAS_CPUFREQ := y
HAS_PCI := y
HAS_PASSTHROUGH := y
@ -57,8 +61,10 @@ Acked-by: Keir Fraser <keir@xen.org>
HAS_KEXEC := y
HAS_GDBSX := y
xenoprof := y
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
Index: xen-4.2.2-testing/xen/arch/x86/physdev.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/physdev.c
+++ xen-4.2.2-testing/xen/arch/x86/physdev.c
@@ -8,6 +8,7 @@
#include <xen/event.h>
#include <xen/guest_access.h>
@ -67,7 +73,7 @@ Acked-by: Keir Fraser <keir@xen.org>
#include <asm/current.h>
#include <asm/io_apic.h>
#include <asm/msi.h>
@@ -722,6 +723,19 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
@@ -734,6 +735,19 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
break;
}
@ -87,8 +93,10 @@ Acked-by: Keir Fraser <keir@xen.org>
default:
ret = -ENOSYS;
break;
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
Index: xen-4.2.2-testing/xen/arch/x86/setup.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/setup.c
+++ xen-4.2.2-testing/xen/arch/x86/setup.c
@@ -606,6 +606,7 @@ void __init __start_xen(unsigned long mb
ns16550.io_base = 0x2f8;
ns16550.irq = 3;
@ -97,16 +105,20 @@ Acked-by: Keir Fraser <keir@xen.org>
console_init_preirq();
printk("Bootloader: %s\n", loader);
--- a/xen/drivers/char/Makefile
+++ b/xen/drivers/char/Makefile
Index: xen-4.2.2-testing/xen/drivers/char/Makefile
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/char/Makefile
+++ xen-4.2.2-testing/xen/drivers/char/Makefile
@@ -1,4 +1,5 @@
obj-y += console.o
obj-$(HAS_NS16550) += ns16550.o
obj-$(HAS_PL011) += pl011.o
+obj-$(HAS_EHCI) += ehci-dbgp.o
obj-y += serial.o
Index: xen-4.2.2-testing/xen/drivers/char/ehci-dbgp.c
===================================================================
--- /dev/null
+++ b/xen/drivers/char/ehci-dbgp.c
+++ xen-4.2.2-testing/xen/drivers/char/ehci-dbgp.c
@@ -0,0 +1,1577 @@
+/*
+ * Standalone EHCI USB debug driver
@ -1685,8 +1697,10 @@ Acked-by: Keir Fraser <keir@xen.org>
+
+ return -ENOSYS;
+}
--- a/xen/drivers/char/serial.c
+++ b/xen/drivers/char/serial.c
Index: xen-4.2.2-testing/xen/drivers/char/serial.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/char/serial.c
+++ xen-4.2.2-testing/xen/drivers/char/serial.c
@@ -265,6 +265,14 @@ int __init serial_parse_handle(char *con
{
int handle;
@ -1702,8 +1716,10 @@ Acked-by: Keir Fraser <keir@xen.org>
if ( strncmp(conf, "com", 3) )
goto fail;
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
Index: xen-4.2.2-testing/xen/include/asm-x86/fixmap.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/fixmap.h
+++ xen-4.2.2-testing/xen/include/asm-x86/fixmap.h
@@ -36,7 +36,15 @@
* from the end of virtual memory backwards.
*/
@ -1721,9 +1737,11 @@ Acked-by: Keir Fraser <keir@xen.org>
#ifdef __i386__
FIX_PAE_HIGHMEM_0,
FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -312,6 +312,24 @@ struct physdev_pci_device {
Index: xen-4.2.2-testing/xen/include/public/physdev.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/public/physdev.h
+++ xen-4.2.2-testing/xen/include/public/physdev.h
@@ -318,6 +318,24 @@ struct physdev_pci_device {
typedef struct physdev_pci_device physdev_pci_device_t;
DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t);
@ -1748,8 +1766,10 @@ Acked-by: Keir Fraser <keir@xen.org>
/*
* Notify that some PIRQ-bound event channels have been unmasked.
* ** This command is obsolete since interface version 0x00030202 and is **
--- a/xen/include/xen/serial.h
+++ b/xen/include/xen/serial.h
Index: xen-4.2.2-testing/xen/include/xen/serial.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/serial.h
+++ xen-4.2.2-testing/xen/include/xen/serial.h
@@ -69,9 +69,10 @@ struct uart_driver {
};


@ -1,146 +0,0 @@
No functional change.
The purpose is to make it easier to backport patches from Xen 4.3's
libxl, as Xen 4.3's libxl has had this done:
    libxl: Enable -Wshadow.
    It was convenient to invent $(CFLAGS_LIBXL) to do this.
    Various renamings to avoid shadowing standard functions:
    - index(3)
    - listen(2)
    - link(2)
    - abort(3)
    - abs(3)
    Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
In this patch we do not change the others, and we do not enable
-Wshadow. We're just trying to bring 4.2's libxl textually closer to
4.3's.
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
---
tools/libxl/libxl_event.c | 34 +++++++++++++++++-----------------
1 files changed, 17 insertions(+), 17 deletions(-)
Index: xen-4.2.1-testing/tools/libxl/libxl_event.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_event.c
+++ xen-4.2.1-testing/tools/libxl/libxl_event.c
@@ -167,15 +167,15 @@ static void time_insert_finite(libxl__gc
}
static int time_register_finite(libxl__gc *gc, libxl__ev_time *ev,
- struct timeval abs)
+ struct timeval absolute)
{
int rc;
- rc = OSEVENT_HOOK(timeout_register, &ev->for_app_reg, abs, ev);
+ rc = OSEVENT_HOOK(timeout_register, &ev->for_app_reg, absolute, ev);
if (rc) return rc;
ev->infinite = 0;
- ev->abs = abs;
+ ev->abs = absolute;
time_insert_finite(gc, ev);
return 0;
@@ -202,16 +202,16 @@ static void time_done_debug(libxl__gc *g
int libxl__ev_time_register_abs(libxl__gc *gc, libxl__ev_time *ev,
libxl__ev_time_callback *func,
- struct timeval abs)
+ struct timeval absolute)
{
int rc;
CTX_LOCK;
DBG("ev_time=%p register abs=%lu.%06lu",
- ev, (unsigned long)abs.tv_sec, (unsigned long)abs.tv_usec);
+ ev, (unsigned long)absolute.tv_sec, (unsigned long)absolute.tv_usec);
- rc = time_register_finite(gc, ev, abs);
+ rc = time_register_finite(gc, ev, absolute);
if (rc) goto out;
ev->func = func;
@@ -228,7 +228,7 @@ int libxl__ev_time_register_rel(libxl__g
libxl__ev_time_callback *func,
int milliseconds /* as for poll(2) */)
{
- struct timeval abs;
+ struct timeval absolute;
int rc;
CTX_LOCK;
@@ -238,10 +238,10 @@ int libxl__ev_time_register_rel(libxl__g
if (milliseconds < 0) {
ev->infinite = 1;
} else {
- rc = time_rel_to_abs(gc, milliseconds, &abs);
+ rc = time_rel_to_abs(gc, milliseconds, &absolute);
if (rc) goto out;
- rc = time_register_finite(gc, ev, abs);
+ rc = time_register_finite(gc, ev, absolute);
if (rc) goto out;
}
@@ -255,26 +255,26 @@ int libxl__ev_time_register_rel(libxl__g
}
int libxl__ev_time_modify_abs(libxl__gc *gc, libxl__ev_time *ev,
- struct timeval abs)
+ struct timeval absolute)
{
int rc;
CTX_LOCK;
DBG("ev_time=%p modify abs==%lu.%06lu",
- ev, (unsigned long)abs.tv_sec, (unsigned long)abs.tv_usec);
+ ev, (unsigned long)absolute.tv_sec, (unsigned long)absolute.tv_usec);
assert(libxl__ev_time_isregistered(ev));
if (ev->infinite) {
- rc = time_register_finite(gc, ev, abs);
+ rc = time_register_finite(gc, ev, absolute);
if (rc) goto out;
} else {
- rc = OSEVENT_HOOK(timeout_modify, &ev->for_app_reg, abs);
+ rc = OSEVENT_HOOK(timeout_modify, &ev->for_app_reg, absolute);
if (rc) goto out;
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
- ev->abs = abs;
+ ev->abs = absolute;
time_insert_finite(gc, ev);
}
@@ -288,7 +288,7 @@ int libxl__ev_time_modify_abs(libxl__gc
int libxl__ev_time_modify_rel(libxl__gc *gc, libxl__ev_time *ev,
int milliseconds)
{
- struct timeval abs;
+ struct timeval absolute;
int rc;
CTX_LOCK;
@@ -304,10 +304,10 @@ int libxl__ev_time_modify_rel(libxl__gc
goto out;
}
- rc = time_rel_to_abs(gc, milliseconds, &abs);
+ rc = time_rel_to_abs(gc, milliseconds, &absolute);
if (rc) goto out;
- rc = libxl__ev_time_modify_abs(gc, ev, abs);
+ rc = libxl__ev_time_modify_abs(gc, ev, absolute);
if (rc) goto out;
rc = 0;
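
For context on the renames this (now-dropped) backport-prep patch performed,
here is a minimal sketch — not part of the patch, with an invented file and
function name — of the class of warning -Wshadow emits when a local shadows a
libc function such as abs(3); GCCs of this era warned about parameters
shadowing global declarations:

    /* shadow-demo.c -- compile with: gcc -Wshadow -c shadow-demo.c
     * Hypothetical example; the parameter 'abs' shadows the global
     * declaration of abs(3) pulled in from <stdlib.h>. */
    #include <stdlib.h>
    #include <sys/time.h>

    static long tv_to_usec(struct timeval abs) /* warning: shadows abs(3) */
    {
        return abs.tv_sec * 1000000L + abs.tv_usec;
    }

Renaming the parameter to 'absolute', as the hunks above do, silences exactly
this warning without any functional change.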


@ -17,16 +17,14 @@ Signed-off-by: Jiongxi Li <jiongxi.li@intel.com>
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -823,6 +823,14 @@ static int vlapic_write(struct vcpu *v,
@@ -822,6 +822,12 @@ static int vlapic_write(struct vcpu *v,
return rc;
}
+int vlapic_apicv_write(struct vcpu *v, unsigned int offset)
+{
+ uint32_t val = vlapic_get_reg(vcpu_vlapic(v), offset);
+
+ vlapic_reg_write(v, offset, val);
+ return 0;
+ return vlapic_reg_write(v, offset, val);
+}
+
int hvm_x2apic_msr_write(struct vcpu *v, unsigned int msr, uint64_t msr_content)
@ -59,7 +57,7 @@ Signed-off-by: Jiongxi Li <jiongxi.li@intel.com>
MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch);
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2274,6 +2274,16 @@ static void vmx_idtv_reinject(unsigned l
@@ -2279,6 +2279,16 @@ static void vmx_idtv_reinject(unsigned l
}
}
@ -76,7 +74,7 @@ Signed-off-by: Jiongxi Li <jiongxi.li@intel.com>
void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
unsigned int exit_reason, idtv_info, intr_info = 0, vector = 0;
@@ -2729,6 +2739,11 @@ void vmx_vmexit_handler(struct cpu_user_
@@ -2741,6 +2751,11 @@ void vmx_vmexit_handler(struct cpu_user_
break;
}


@ -51,7 +51,7 @@ Committed-by: Keir Fraser <keir@xen.org>
int vlapic_ipi(
struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high)
{
@@ -1000,6 +1011,14 @@ void vlapic_adjust_i8259_target(struct d
@@ -996,6 +1007,14 @@ void vlapic_adjust_i8259_target(struct d
pt_adjust_global_vcpu_target(v);
}
@ -66,7 +66,7 @@ Committed-by: Keir Fraser <keir@xen.org>
int vlapic_has_pending_irq(struct vcpu *v)
{
struct vlapic *vlapic = vcpu_vlapic(v);
@@ -1012,6 +1031,9 @@ int vlapic_has_pending_irq(struct vcpu *
@@ -1008,6 +1027,9 @@ int vlapic_has_pending_irq(struct vcpu *
if ( irr == -1 )
return -1;
@ -76,7 +76,7 @@ Committed-by: Keir Fraser <keir@xen.org>
isr = vlapic_find_highest_isr(vlapic);
isr = (isr != -1) ? isr : 0;
if ( (isr & 0xf0) >= (irr & 0xf0) )
@@ -1024,6 +1046,9 @@ int vlapic_ack_pending_irq(struct vcpu *
@@ -1020,6 +1042,9 @@ int vlapic_ack_pending_irq(struct vcpu *
{
struct vlapic *vlapic = vcpu_vlapic(v);
@ -88,7 +88,7 @@ Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -206,6 +206,7 @@ void vmx_intr_assist(void)
@@ -209,6 +209,7 @@ void vmx_intr_assist(void)
struct vcpu *v = current;
unsigned int tpr_threshold = 0;
enum hvm_intblk intblk;
@ -96,7 +96,7 @@ Committed-by: Keir Fraser <keir@xen.org>
/* Block event injection when single step with MTF. */
if ( unlikely(v->arch.hvm_vcpu.single_step) )
@@ -216,7 +217,7 @@ void vmx_intr_assist(void)
@@ -219,7 +220,7 @@ void vmx_intr_assist(void)
}
/* Crank the handle on interrupt state. */
@ -105,7 +105,7 @@ Committed-by: Keir Fraser <keir@xen.org>
do {
intack = hvm_vcpu_has_pending_irq(v);
@@ -227,16 +228,34 @@ void vmx_intr_assist(void)
@@ -230,16 +231,34 @@ void vmx_intr_assist(void)
goto out;
intblk = hvm_interrupt_blocked(v, intack);
@ -145,7 +145,7 @@ Committed-by: Keir Fraser <keir@xen.org>
{
enable_intr_window(v, intack);
goto out;
@@ -253,6 +272,44 @@ void vmx_intr_assist(void)
@@ -256,6 +275,44 @@ void vmx_intr_assist(void)
{
hvm_inject_hw_exception(TRAP_machine_check, HVM_DELIVER_NO_ERROR_CODE);
}
@ -190,7 +190,7 @@ Committed-by: Keir Fraser <keir@xen.org>
else
{
HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
@@ -262,11 +319,16 @@ void vmx_intr_assist(void)
@@ -265,11 +322,16 @@ void vmx_intr_assist(void)
/* Is there another IRQ to queue up behind this one? */
intack = hvm_vcpu_has_pending_irq(v);
@ -291,7 +291,7 @@ Committed-by: Keir Fraser <keir@xen.org>
struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1502,6 +1502,22 @@ static void vmx_set_info_guest(struct vc
@@ -1507,6 +1507,22 @@ static void vmx_set_info_guest(struct vc
vmx_vmcs_exit(v);
}
@ -314,7 +314,7 @@ Committed-by: Keir Fraser <keir@xen.org>
static struct hvm_function_table __read_mostly vmx_function_table = {
.name = "VMX",
.cpu_up_prepare = vmx_cpu_up_prepare,
@@ -1548,7 +1564,9 @@ static struct hvm_function_table __read_
@@ -1553,7 +1569,9 @@ static struct hvm_function_table __read_
.nhvm_vmcx_guest_intercepts_trap = nvmx_intercepts_exception,
.nhvm_vcpu_vmexit_trap = nvmx_vmexit_trap,
.nhvm_intr_blocked = nvmx_intr_blocked,
@ -325,7 +325,7 @@ Committed-by: Keir Fraser <keir@xen.org>
};
struct hvm_function_table * __init start_vmx(void)
@@ -2284,6 +2302,17 @@ static int vmx_handle_apic_write(void)
@@ -2289,6 +2307,17 @@ static int vmx_handle_apic_write(void)
return vlapic_apicv_write(current, offset);
}
@ -343,7 +343,7 @@ Committed-by: Keir Fraser <keir@xen.org>
void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
unsigned int exit_reason, idtv_info, intr_info = 0, vector = 0;
@@ -2677,6 +2706,16 @@ void vmx_vmexit_handler(struct cpu_user_
@@ -2689,6 +2718,16 @@ void vmx_vmexit_handler(struct cpu_user_
hvm_inject_hw_exception(TRAP_gp_fault, 0);
break;


@ -16,8 +16,10 @@ corresponding x2apic MSRs:
Signed-off-by: Jiongxi Li <jiongxi.li@intel.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vmcs.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmcs.c
@@ -658,7 +658,7 @@ static void vmx_set_host_env(struct vcpu
(unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
}
@ -82,9 +84,11 @@ Committed-by: Keir Fraser <keir@xen.org>
}
/* I/O access bitmap. */
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2036,7 +2036,7 @@ static int vmx_msr_write_intercept(unsig
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vmx.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
@@ -2041,7 +2041,7 @@ static int vmx_msr_write_intercept(unsig
for ( ; (rc == 0) && lbr->count; lbr++ )
for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
@ -93,8 +97,10 @@ Committed-by: Keir Fraser <keir@xen.org>
}
if ( (rc < 0) ||
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
Index: xen-4.2.2-testing/xen/include/asm-x86/hvm/vmx/vmcs.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ xen-4.2.2-testing/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -407,7 +407,9 @@ enum vmcs_field {
#define VMCS_VPID_WIDTH 16
@ -106,9 +112,11 @@ Committed-by: Keir Fraser <keir@xen.org>
int vmx_read_guest_msr(u32 msr, u64 *val);
int vmx_write_guest_msr(u32 msr, u64 val);
int vmx_add_guest_msr(u32 msr);
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -291,6 +291,9 @@
Index: xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/msr-index.h
+++ xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
@@ -293,6 +293,9 @@
#define MSR_IA32_APICBASE_ENABLE (1<<11)
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
#define MSR_IA32_APICBASE_MSR 0x800


@ -17,11 +17,11 @@ domain's permission is sufficient.
Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Committed-by: Jan Beulich <jbeulich@suse.com>
Index: xen-4.2.0-testing/xen/arch/x86/mm.c
Index: xen-4.2.2-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.0-testing/xen/arch/x86/mm.c
@@ -883,6 +883,19 @@ get_page_from_l1e(
--- xen-4.2.2-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.2-testing/xen/arch/x86/mm.c
@@ -884,6 +884,19 @@ get_page_from_l1e(
return -EINVAL;
}


@ -24,10 +24,10 @@ would be happy when sync tsc.
Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
Index: xen-4.2.0-testing/xen/arch/x86/hvm/hvm.c
Index: xen-4.2.2-testing/xen/arch/x86/hvm/hvm.c
===================================================================
--- xen-4.2.0-testing.orig/xen/arch/x86/hvm/hvm.c
+++ xen-4.2.0-testing/xen/arch/x86/hvm/hvm.c
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/hvm.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/hvm.c
@@ -244,6 +244,7 @@ int hvm_set_guest_pat(struct vcpu *v, u6
void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
{
@ -103,10 +103,10 @@ Index: xen-4.2.0-testing/xen/arch/x86/hvm/hvm.c
paging_update_paging_modes(v);
v->arch.flags |= TF_kernel_mode;
Index: xen-4.2.0-testing/xen/include/asm-x86/hvm/vcpu.h
Index: xen-4.2.2-testing/xen/include/asm-x86/hvm/vcpu.h
===================================================================
--- xen-4.2.0-testing.orig/xen/include/asm-x86/hvm/vcpu.h
+++ xen-4.2.0-testing/xen/include/asm-x86/hvm/vcpu.h
--- xen-4.2.2-testing.orig/xen/include/asm-x86/hvm/vcpu.h
+++ xen-4.2.2-testing/xen/include/asm-x86/hvm/vcpu.h
@@ -137,6 +137,7 @@ struct hvm_vcpu {
struct hvm_vcpu_asid n1asid;
@ -115,11 +115,11 @@ Index: xen-4.2.0-testing/xen/include/asm-x86/hvm/vcpu.h
/* VPMU */
struct vpmu_struct vpmu;
Index: xen-4.2.0-testing/xen/include/asm-x86/msr-index.h
Index: xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
===================================================================
--- xen-4.2.0-testing.orig/xen/include/asm-x86/msr-index.h
+++ xen-4.2.0-testing/xen/include/asm-x86/msr-index.h
@@ -284,6 +284,7 @@
--- xen-4.2.2-testing.orig/xen/include/asm-x86/msr-index.h
+++ xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
@@ -286,6 +286,7 @@
#define MSR_IA32_PLATFORM_ID 0x00000017
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_IA32_EBC_FREQUENCY_ID 0x0000002c

View File

@ -10,8 +10,10 @@ And some initial Haswell ones at once.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Nakajima, Jun" <jun.nakajima@intel.com>
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
Index: xen-4.2.2-testing/xen/arch/x86/acpi/cpu_idle.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/acpi/cpu_idle.c
+++ xen-4.2.2-testing/xen/arch/x86/acpi/cpu_idle.c
@@ -105,11 +105,15 @@ static void do_get_hw_residencies(void *
switch ( c->x86_model )
@ -30,9 +32,11 @@ Acked-by: "Nakajima, Jun" <jun.nakajima@intel.com>
GET_PC2_RES(hw_res->pc2);
GET_CC7_RES(hw_res->cc7);
/* fall through */
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1820,7 +1820,9 @@ static const struct lbr_info *last_branc
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vmx.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
@@ -1825,7 +1825,9 @@ static const struct lbr_info *last_branc
/* Sandy Bridge */
case 42: case 45:
/* Ivy Bridge */
@ -43,9 +47,11 @@ Acked-by: "Nakajima, Jun" <jun.nakajima@intel.com>
return nh_lbr;
break;
/* Atom */
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -747,6 +747,7 @@ int vmx_vpmu_initialise(struct vcpu *v,
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vpmu_core2.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -747,6 +747,7 @@ int vmx_vpmu_initialise(struct vcpu *v,
case 46:
case 47:
case 58:


@ -1,67 +0,0 @@
References: bnc#785211
# HG changeset patch
# User Huang Ying <ying.huang@intel.com>
# Date 1350401196 -7200
# Node ID 4fc87c2f31a02c770655518c9e4d389302564f00
# Parent c1c549c4fe9ebdc460cbf51e296edad157b6e518
ACPI: fix APEI related table size checking
On Huang Ying's machine:
erst_tab->header_length == sizeof(struct acpi_table_einj)
but Yinghai reported that on his machine,
erst_tab->header_length == sizeof(struct acpi_table_einj) -
sizeof(struct acpi_table_header)
To make the ERST table size checking code work on all systems, both
tests are treated as PASS.
The same situation applies to einj_tab->header_length, so the
corresponding table size check is changed in a similar way too.
Originally-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
- use switch() for better readability
- add comment explaining why a formally invalid size is also being
  accepted
- check erst_tab->header.length before even looking at
erst_tab->header_length
- prefer sizeof(*erst_tab) over sizeof(struct acpi_table_erst)
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/apei/erst.c
+++ b/xen/drivers/acpi/apei/erst.c
@@ -715,12 +715,23 @@ int erst_clear(u64 record_id)
static int __init erst_check_table(struct acpi_table_erst *erst_tab)
{
- if (erst_tab->header_length != sizeof(struct acpi_table_erst))
+ if (erst_tab->header.length < sizeof(*erst_tab))
return -EINVAL;
- if (erst_tab->header.length < sizeof(struct acpi_table_erst))
+
+ switch (erst_tab->header_length) {
+ case sizeof(*erst_tab) - sizeof(erst_tab->header):
+ /*
+ * While invalid per specification, there are (early?) systems
+ * indicating the full header size here, so accept that value too.
+ */
+ case sizeof(*erst_tab):
+ break;
+ default:
return -EINVAL;
+ }
+
if (erst_tab->entries !=
- (erst_tab->header.length - sizeof(struct acpi_table_erst)) /
+ (erst_tab->header.length - sizeof(*erst_tab)) /
sizeof(struct acpi_erst_entry))
return -EINVAL;


@ -1,95 +0,0 @@
References: bnc#785211
# HG changeset patch
# User Huang Ying <ying.huang@intel.com>
# Date 1350475926 -7200
# Node ID ec8a091efcce717584b00ce76e3cec40a6247ebc
# Parent 4b4c0c7a6031820ab521fdd6764cb0df157f44bf
ACPI/APEI: fix ERST MOVE_DATA instruction implementation
The src_base and dst_base fields in apei_exec_context are physical
addresses, so they should be ioremapped before being used in the ERST
MOVE_DATA instruction.
Reported-by: Javier Martinez Canillas <martinez.javier@gmail.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Replace use of ioremap() by __acpi_map_table()/set_fixmap(). Fix error
handling.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/apei/erst.c
+++ b/xen/drivers/acpi/apei/erst.c
@@ -247,15 +247,64 @@ static int erst_exec_move_data(struct ap
{
int rc;
u64 offset;
+#ifdef CONFIG_X86
+ enum fixed_addresses idx;
+#endif
+ void *src, *dst;
+
+ /* ioremap does not work in interrupt context */
+ if (in_irq()) {
+ printk(KERN_WARNING
+ "MOVE_DATA cannot be used in interrupt context\n");
+ return -EBUSY;
+ }
rc = __apei_exec_read_register(entry, &offset);
if (rc)
return rc;
- memmove((void *)(unsigned long)(ctx->dst_base + offset),
- (void *)(unsigned long)(ctx->src_base + offset),
- ctx->var2);
- return 0;
+#ifdef CONFIG_X86
+ switch (ctx->var2) {
+ case 0:
+ return 0;
+ case 1 ... PAGE_SIZE:
+ break;
+ default:
+ printk(KERN_WARNING
+ "MOVE_DATA cannot be used for %#"PRIx64" bytes of data\n",
+ ctx->var2);
+ return -EOPNOTSUPP;
+ }
+
+ src = __acpi_map_table(ctx->src_base + offset, ctx->var2);
+#else
+ src = ioremap(ctx->src_base + offset, ctx->var2);
+#endif
+ if (!src)
+ return -ENOMEM;
+
+#ifdef CONFIG_X86
+ BUILD_BUG_ON(FIX_ACPI_PAGES < 4);
+ idx = virt_to_fix((unsigned long)src + 2 * PAGE_SIZE);
+ offset += ctx->dst_base;
+ dst = (void *)fix_to_virt(idx) + (offset & ~PAGE_MASK);
+ set_fixmap(idx, offset);
+ if (PFN_DOWN(offset) != PFN_DOWN(offset + ctx->var2 - 1)) {
+ idx = virt_to_fix((unsigned long)dst + PAGE_SIZE);
+ set_fixmap(idx, offset + PAGE_SIZE);
+ }
+#else
+ dst = ioremap(ctx->dst_base + offset, ctx->var2);
+#endif
+ if (dst) {
+ memmove(dst, src, ctx->var2);
+ iounmap(dst);
+ } else
+ rc = -ENOMEM;
+
+ iounmap(src);
+
+ return rc;
}
static struct apei_exec_ins_type erst_ins_type[] = {
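
One detail of the fixed implementation worth isolating is its page-crossing
test: a destination window of var2 bytes starting at offset needs a second
fixmap slot iff its first and last bytes fall in different page frames. A
self-contained sketch of that test (assuming 4 KiB pages; illustrative, not
Xen code):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12                  /* assumed 4 KiB pages */
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    /* Mirrors the patch's PFN_DOWN(offset) != PFN_DOWN(offset + len - 1). */
    static int crosses_page(uint64_t offset, uint64_t len)
    {
        return PFN_DOWN(offset) != PFN_DOWN(offset + len - 1);
    }

    int main(void)
    {
        printf("%d\n", crosses_page(0xff8, 8)); /* 0: last byte is 0xfff  */
        printf("%d\n", crosses_page(0xff8, 9)); /* 1: spills into 0x1000 */
        return 0;
    }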


@ -15,9 +15,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -564,7 +564,7 @@ static hw_irq_controller iommu_msi_type
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_init.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/iommu_init.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_init.c
@@ -564,7 +564,7 @@ static hw_irq_controller iommu_msi_type
static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
{
@ -47,9 +49,11 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
}
else
{
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -214,6 +214,7 @@ static int device_assigned(u16 seg, u8 b
Index: xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
@@ -218,6 +218,7 @@ static int device_assigned(u16 seg, u8 b
static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
{
struct hvm_iommu *hd = domain_hvm_iommu(d);
@ -57,7 +61,7 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
int rc = 0;
if ( !iommu_enabled || !hd->platform_ops )
@@ -227,6 +228,10 @@ static int assign_device(struct domain *
@@ -231,6 +232,10 @@ static int assign_device(struct domain *
return -EXDEV;
spin_lock(&pcidevs_lock);
@ -68,7 +72,7 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
if ( (rc = hd->platform_ops->assign_device(d, seg, bus, devfn)) )
goto done;
@@ -378,6 +383,8 @@ int deassign_device(struct domain *d, u1
@@ -382,6 +387,8 @@ int deassign_device(struct domain *d, u1
return ret;
}
@ -77,8 +81,10 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
if ( !has_arch_pdevs(d) && need_iommu(d) )
{
d->need_iommu = 0;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/pci.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/pci.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/pci.c
@@ -637,6 +637,36 @@ int __init pci_device_detect(u16 seg, u8
return 1;
}
@ -116,8 +122,10 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
/*
* scan pci devices to add all existed PCI devices to alldevs_list,
* and setup pci hierarchy in array bus2bridge.
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
@@ -936,7 +936,7 @@ static void __do_iommu_page_fault(struct
while (1)
{
@ -144,9 +152,11 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
fault_index++;
if ( fault_index > cap_num_fault_regs(iommu->cap) )
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -64,6 +64,11 @@ struct pci_dev {
Index: xen-4.2.2-testing/xen/include/xen/pci.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/pci.h
+++ xen-4.2.2-testing/xen/include/xen/pci.h
@@ -65,6 +65,11 @@ struct pci_dev {
const u8 devfn;
struct pci_dev_info info;
struct arch_pci_dev arch;
@ -158,7 +168,7 @@ Acked-by: Dario Faggioli <dario.faggioli@citrix.com>
u64 vf_rlen[6];
};
@@ -106,6 +111,7 @@ void arch_pci_ro_device(int seg, int bdf
@@ -107,6 +112,7 @@ void arch_pci_ro_device(int seg, int bdf
struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
struct pci_dev *pci_get_pdev_by_domain(
struct domain *, int seg, int bus, int devfn);

View File

@ -1,88 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1353575003 -3600
# Node ID c139ca92edca2fab8ec95deb7fd9e4246c3fe28d
# Parent af6b72a224e99a4a516fbc2eecc06ada569304e8
x86/HPET: fix FSB interrupt masking
HPET_TN_FSB is not really suitable for masking interrupts - it merely
switches between the two delivery methods. The right way of masking is
through the HPET_TN_ENABLE bit (which really is an interrupt enable,
not a counter enable or some such). This is even more so with certain
chip sets not even allowing HPET_TN_FSB to be cleared on some of the
channels.
Further, all the setup of the channel should happen before actually
enabling the interrupt, which requires splitting legacy and FSB logic.
Finally this also fixes an S3 resume problem (HPET_TN_FSB did not get
set in hpet_broadcast_resume(), and hpet_msi_unmask() doesn't get
called from the general resume code either afaict).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -236,7 +236,7 @@ static void hpet_msi_unmask(struct irq_d
struct hpet_event_channel *ch = desc->action->dev_id;
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
- cfg |= HPET_TN_FSB;
+ cfg |= HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
}
@@ -246,7 +246,7 @@ static void hpet_msi_mask(struct irq_des
struct hpet_event_channel *ch = desc->action->dev_id;
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
- cfg &= ~HPET_TN_FSB;
+ cfg &= ~HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
}
@@ -319,8 +319,14 @@ static void __hpet_setup_msi_irq(struct
static int __init hpet_setup_msi_irq(unsigned int irq, struct hpet_event_channel *ch)
{
int ret;
+ u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
irq_desc_t *desc = irq_to_desc(irq);
+ /* set HPET Tn as oneshot */
+ cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC);
+ cfg |= HPET_TN_FSB | HPET_TN_32BIT;
+ hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
+
desc->handler = &hpet_msi_type;
ret = request_irq(irq, hpet_interrupt_handler, 0, "HPET", ch);
if ( ret < 0 )
@@ -541,11 +547,14 @@ void __init hpet_broadcast_init(void)
for ( i = 0; i < n; i++ )
{
- /* set HPET Tn as oneshot */
- cfg = hpet_read32(HPET_Tn_CFG(hpet_events[i].idx));
- cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC);
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_write32(cfg, HPET_Tn_CFG(hpet_events[i].idx));
+ if ( i == 0 && (cfg & HPET_CFG_LEGACY) )
+ {
+ /* set HPET T0 as oneshot */
+ cfg = hpet_read32(HPET_Tn_CFG(0));
+ cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC);
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_write32(cfg, HPET_Tn_CFG(0));
+ }
/*
* The period is a femto seconds value. We need to calculate the scaled
@@ -602,6 +611,8 @@ void hpet_broadcast_resume(void)
cfg = hpet_read32(HPET_Tn_CFG(hpet_events[i].idx));
cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC);
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ if ( !(hpet_events[i].flags & HPET_EVT_LEGACY) )
+ cfg |= HPET_TN_FSB;
hpet_write32(cfg, HPET_Tn_CFG(hpet_events[i].idx));
hpet_events[i].next_event = STIME_MAX;


@ -1,28 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1354118456 -3600
# Node ID 836697b197462f89a4d296da9482d1719dcc0836
# Parent 1fce7522daa6bab9fce93b95adf592193c904097
IOMMU: imply "verbose" from "debug"
I think that generally enabling debugging code without also enabling
verbose output is rather pointless; if someone really wants this, they
can always pass e.g. "iommu=debug,no-verbose".
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -91,7 +91,11 @@ static void __init parse_iommu_param(cha
else if ( !strcmp(s, "intremap") )
iommu_intremap = val;
else if ( !strcmp(s, "debug") )
+ {
iommu_debug = val;
+ if ( val )
+ iommu_verbose = 1;
+ }
else if ( !strcmp(s, "amd-iommu-perdev-intremap") )
amd_iommu_perdev_intremap = val;
else if ( !strcmp(s, "dom0-passthrough") )


@ -1,52 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1354697534 -3600
# Node ID 670b07e8d7382229639af0d1df30071e6c1ebb19
# Parent bc624b00d6d601f00a53c2f7502a82dcef60f882
IOMMU/ATS: fix maximum queue depth calculation
The capabilities register field is a 5-bit value, and the 5 bits all
being zero actually means 32 entries.
Under the assumption that amd_iommu_flush_iotlb() really just tried
to correct for the miscalculation above when adding 32 to the value,
that adjustment is also being removed.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Acked-by: Wei Huang <wei.huang2@amd.com>
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -321,7 +321,7 @@ void amd_iommu_flush_iotlb(struct pci_de
req_id = get_dma_requestor_id(iommu->seg, bdf);
queueid = req_id;
- maxpend = (ats_pdev->ats_queue_depth + 32) & 0xff;
+ maxpend = ats_pdev->ats_queue_depth & 0xff;
/* send INVALIDATE_IOTLB_PAGES command */
spin_lock_irqsave(&iommu->lock, flags);
--- a/xen/drivers/passthrough/ats.h
+++ b/xen/drivers/passthrough/ats.h
@@ -30,7 +30,7 @@ struct pci_ats_dev {
#define ATS_REG_CAP 4
#define ATS_REG_CTL 6
-#define ATS_QUEUE_DEPTH_MASK 0xF
+#define ATS_QUEUE_DEPTH_MASK 0x1f
#define ATS_ENABLE (1<<15)
extern struct list_head ats_devices;
--- a/xen/drivers/passthrough/x86/ats.c
+++ b/xen/drivers/passthrough/x86/ats.c
@@ -93,7 +93,8 @@ int enable_ats_device(int seg, int bus,
pdev->devfn = devfn;
value = pci_conf_read16(seg, bus, PCI_SLOT(devfn),
PCI_FUNC(devfn), pos + ATS_REG_CAP);
- pdev->ats_queue_depth = value & ATS_QUEUE_DEPTH_MASK;
+ pdev->ats_queue_depth = value & ATS_QUEUE_DEPTH_MASK ?:
+ ATS_QUEUE_DEPTH_MASK + 1;
list_add(&pdev->list, &ats_devices);
}
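
The encoding this fix handles is easy to get wrong, so here is a standalone
sketch of the decode rule (illustrative C, not Xen code): the capability
field is 5 bits wide, and an all-zero field means the maximum of 32 entries.
The patch expresses the same computation with GCC's `x ?: y` shorthand.

    #include <assert.h>

    #define ATS_QUEUE_DEPTH_MASK 0x1f      /* 5-bit invalidate queue depth */

    static unsigned int ats_queue_depth(unsigned int cap)
    {
        unsigned int depth = cap & ATS_QUEUE_DEPTH_MASK;
        return depth ? depth : ATS_QUEUE_DEPTH_MASK + 1;  /* 0 encodes 32 */
    }

    int main(void)
    {
        assert(ats_queue_depth(0x00) == 32);
        assert(ats_queue_depth(0x07) == 7);
        assert(ats_queue_depth(0x3f) == 31);  /* only the low 5 bits count */
        return 0;
    }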


@ -1,28 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1354812866 0
# Node ID 312f0713dfc98635fd9ed4b42481581489faa28f
# Parent bfd8e96fa3f157630f9698401a1f040ca1776c8e
nested vmx: fix rflags status in virtual vmexit
As stated in the SDM, all bits in RFLAGS (except the 1-reserved ones)
are set to 0 on VM exit. Therefore we need to follow this logic in
virtual_vmexit.
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -990,7 +990,8 @@ static void virtual_vmexit(struct cpu_us
regs->eip = __get_vvmcs(nvcpu->nv_vvmcx, HOST_RIP);
regs->esp = __get_vvmcs(nvcpu->nv_vvmcx, HOST_RSP);
- regs->eflags = __vmread(GUEST_RFLAGS);
+ /* VM exit clears all bits except bit 1 */
+ regs->eflags = 0x2;
/* updating host cr0 to sync TS bit */
__vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);


@ -1,46 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1354812981 0
# Node ID a09150b57ace2fa786dcaefa958f0b197b1b6d4c
# Parent 312f0713dfc98635fd9ed4b42481581489faa28f
nested vmx: fix handling of RDTSC
If L0 is to handle the TSC access, then we need to update guest EIP by
calling update_guest_eip().
Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1613,7 +1613,7 @@ static int get_instruction_length(void)
return len;
}
-static void update_guest_eip(void)
+void update_guest_eip(void)
{
struct cpu_user_regs *regs = guest_cpu_user_regs();
unsigned long x;
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1558,6 +1558,7 @@ int nvmx_n2_vmexit_handler(struct cpu_us
tsc += __get_vvmcs(nvcpu->nv_vvmcx, TSC_OFFSET);
regs->eax = (uint32_t)tsc;
regs->edx = (uint32_t)(tsc >> 32);
+ update_guest_eip();
return 1;
}
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -396,6 +396,8 @@ void ept_p2m_init(struct p2m_domain *p2m
void ept_walk_table(struct domain *d, unsigned long gfn);
void setup_ept_dump(void);
+void update_guest_eip(void);
+
/* EPT violation qualifications definitions */
#define _EPT_READ_VIOLATION 0
#define EPT_READ_VIOLATION (1UL<<_EPT_READ_VIOLATION)


@ -1,27 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1354813009 0
# Node ID e6eb1e52da7cfcb1a7697b35b4d842f35107d1ed
# Parent a09150b57ace2fa786dcaefa958f0b197b1b6d4c
nested vmx: fix DR access VM exit
For DR registers, we use a lazy restore mechanism on access.
Therefore, when receiving such a VM exit, L0 is responsible for
switching to the right DR values before injecting the exit into the
L1 hypervisor.
Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1585,7 +1585,8 @@ int nvmx_n2_vmexit_handler(struct cpu_us
break;
case EXIT_REASON_DR_ACCESS:
ctrl = __n2_exec_control(v);
- if ( ctrl & CPU_BASED_MOV_DR_EXITING )
+ if ( (ctrl & CPU_BASED_MOV_DR_EXITING) &&
+ v->arch.hvm_vcpu.flag_dr_dirty )
nvcpu->nv_vmexit_pending = 1;
break;
case EXIT_REASON_INVLPG:


@ -1,30 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1354813046 0
# Node ID 1ed1507fa0407f1da715d04fe1b510e81ca4fb31
# Parent e6eb1e52da7cfcb1a7697b35b4d842f35107d1ed
nested vmx: enable IA32E mode while doing VM entry
Some VMMs may check the platform capability to judge whether long
mode guests are supported. Therefore we need to expose this bit to
the guest VMM.
Xen on Xen works fine with the current solution because Xen doesn't
check this capability but directly sets it in the VMCS if the guest
supports long mode.
Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1351,7 +1351,7 @@ int nvmx_msr_read_intercept(unsigned int
case MSR_IA32_VMX_ENTRY_CTLS:
/* bit 0-8, and 12 must be 1 (refer G5 of SDM) */
data = 0x11ff;
- data = (data << 32) | data;
+ data = ((data | VM_ENTRY_IA32E_MODE) << 32) | data;
break;
case IA32_FEATURE_CONTROL_MSR:


@ -1,45 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1354813139 0
# Node ID 90831c29bfde6aac013b7e5ec98934a4953c31c9
# Parent 25dd352265ca23750f1a1a983124b36f518c4384
nested vmx: fix interrupt delivery to L2 guest
While delivering an interrupt into the L2 guest, the L0 hypervisor
needs to check whether the L1 hypervisor wants to own the interrupt;
if not, it directly injects the interrupt into the L2 guest.
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -163,7 +163,7 @@ enum hvm_intblk nvmx_intr_blocked(struct
static int nvmx_intr_intercept(struct vcpu *v, struct hvm_intack intack)
{
- u32 exit_ctrl;
+ u32 ctrl;
if ( nvmx_intr_blocked(v) != hvm_intblk_none )
{
@@ -176,11 +176,14 @@ static int nvmx_intr_intercept(struct vc
if ( intack.source == hvm_intsrc_pic ||
intack.source == hvm_intsrc_lapic )
{
+ ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL);
+ if ( !(ctrl & PIN_BASED_EXT_INTR_MASK) )
+ return 0;
+
vmx_inject_extint(intack.vector);
- exit_ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx,
- VM_EXIT_CONTROLS);
- if ( exit_ctrl & VM_EXIT_ACK_INTR_ON_EXIT )
+ ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, VM_EXIT_CONTROLS);
+ if ( ctrl & VM_EXIT_ACK_INTR_ON_EXIT )
{
/* for now, duplicate the ack path in vmx_intr_assist */
hvm_vcpu_ack_pending_irq(v, intack);


@ -1,70 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1355134467 -3600
# Node ID 8d209624ea83b272e1ebd713a928c38d4782f4f1
# Parent f96a0cda12160f497981a37f6922a1ed7db9a462
scheduler: fix rate limit range checking
For one, neither of the two checks permitted the documented value of
zero (which disables the functionality altogether).
Second, the range checking of the command line parameter was done by
the credit scheduler's initialization code, despite it being a generic
scheduler option.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -846,8 +846,9 @@ csched_sys_cntl(const struct scheduler *
case XEN_SYSCTL_SCHEDOP_putinfo:
if (params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX
|| params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN
- || params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
- || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN
+ || (params->ratelimit_us
+ && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
+ || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
|| MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) )
goto out;
prv->tslice_ms = params->tslice_ms;
@@ -1607,17 +1608,6 @@ csched_init(struct scheduler *ops)
sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
}
- if ( sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
- || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )
- {
- printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
- " Resetting to default %u\n",
- XEN_SYSCTL_SCHED_RATELIMIT_MIN,
- XEN_SYSCTL_SCHED_RATELIMIT_MAX,
- SCHED_DEFAULT_RATELIMIT_US);
- sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
- }
-
prv->tslice_ms = sched_credit_tslice_ms;
prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
if ( prv->tslice_ms < prv->ticks_per_tslice )
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1322,6 +1322,18 @@ void __init scheduler_init(void)
if ( SCHED_OP(&ops, init) )
panic("scheduler returned error on init\n");
+ if ( sched_ratelimit_us &&
+ (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
+ || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
+ {
+ printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
+ " Resetting to default %u\n",
+ XEN_SYSCTL_SCHED_RATELIMIT_MIN,
+ XEN_SYSCTL_SCHED_RATELIMIT_MAX,
+ SCHED_DEFAULT_RATELIMIT_US);
+ sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
+ }
+
idle_domain = domain_create(DOMID_IDLE, 0, 0);
BUG_ON(IS_ERR(idle_domain));
idle_domain->vcpu = idle_vcpu;
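
Both hunks converge on the same validation pattern: zero means "disabled"
and must bypass the range check entirely. The pattern in isolation (the
bounds below are placeholders; Xen's real XEN_SYSCTL_SCHED_RATELIMIT_{MIN,MAX}
live in the public sysctl header):

    #include <assert.h>

    #define RATELIMIT_MIN 100       /* placeholder bounds */
    #define RATELIMIT_MAX 500000

    /* Valid iff the feature is disabled (0) or the value is in range. */
    static int ratelimit_valid(unsigned int us)
    {
        return us == 0 || (us >= RATELIMIT_MIN && us <= RATELIMIT_MAX);
    }

    int main(void)
    {
        assert(ratelimit_valid(0));      /* disabled: always accepted */
        assert(ratelimit_valid(1000));
        assert(!ratelimit_valid(99));    /* below MIN and not zero */
        return 0;
    }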


@ -1,82 +0,0 @@
# HG changeset patch
# User Dario Faggioli <dario.faggioli@citrix.com>
# Date 1355854218 0
# Node ID 127c2c47d440eb7f3248ab5561909e326af7e328
# Parent d5c0389bf26c89969ebce71927f34f6b923af949
xen: sched_credit: improve picking up the idle CPU for a VCPU
In _csched_cpu_pick() we try to select the best possible CPU for
running a VCPU, considering the characteristics of the underlying
hardware (i.e., how many threads, core, sockets, and how busy they
are). What we want is "the idle execution vehicle with the most
idling neighbours in its grouping".
In order to achieve it, we select a CPU from the VCPU's affinity,
giving preference to its current processor if possible, as the basis
for the comparison with all the other CPUs. Problem is, to discount
the VCPU itself when computing this "idleness" (in an attempt to be
fair wrt its current processor), we arbitrarily and unconditionally
consider that selected CPU as idle, even when it is not the case,
for instance:
1. If the CPU is not the one where the VCPU is running (perhaps due
to the affinity being changed);
2. The CPU is where the VCPU is running, but it has other VCPUs in
its runq, so it won't go idle even if the VCPU in question goes.
This is exemplified in the trace below:
] 3.466115364 x|------|------| d10v1 22005(2:2:5) 3 [ a 1 8 ]
... ... ...
3.466122856 x|------|------| d10v1 runstate_change d10v1
running->offline
3.466123046 x|------|------| d?v? runstate_change d32767v0
runnable->running
... ... ...
] 3.466126887 x|------|------| d32767v0 28004(2:8:4) 3 [ a 1 8 ]
22005(...) line (the first line) means _csched_cpu_pick() was called
on VCPU 1 of domain 10, while it is running on CPU 0, and it choose
CPU 8, which is busy ('|'), even if there are plenty of idle
CPUs. That is because, as a consequence of changing the VCPU affinity,
CPU 8 was chosen as the basis for the comparison, and therefore
considered idle (its bit gets unconditionally set in the bitmask
representing the idle CPUs). 28004(...) line means the VCPU is woken
up and queued on CPU 8's runq, where it waits for a context switch or
a migration, in order to be able to execute.
This change fixes things by only considering the "guessed" CPU idle if
the VCPU in question is both running there and is its only runnable
VCPU.
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -72,6 +72,9 @@
#define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
+/* Is the first element of _cpu's runq its idle vcpu? */
+#define IS_RUNQ_IDLE(_cpu) (list_empty(RUNQ(_cpu)) || \
+ is_idle_vcpu(__runq_elem(RUNQ(_cpu)->next)->vcpu))
/*
@@ -487,9 +490,14 @@ _csched_cpu_pick(const struct scheduler
* distinct cores first and guarantees we don't do something stupid
* like run two VCPUs on co-hyperthreads while there are idle cores
* or sockets.
+ *
+ * Notice that, when computing the "idleness" of cpu, we may want to
+ * discount vc. That is, iff vc is the currently running and the only
+ * runnable vcpu on cpu, we add cpu to the idlers.
*/
cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers);
- cpumask_set_cpu(cpu, &idlers);
+ if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) )
+ cpumask_set_cpu(cpu, &idlers);
cpumask_and(&cpus, &cpus, &idlers);
cpumask_clear_cpu(cpu, &cpus);


@ -1,71 +0,0 @@
# HG changeset patch
# User Andre Przywara <osp@andrep.de>
# Date 1355913729 -3600
# Node ID 5fb0b8b838dab0b331abfa675fd2b2214ac90760
# Parent b04de677de31f26ba4b8f2f382ca4dfffcff9a79
x86, amd: Disable way access filter on Piledriver CPUs
The Way Access Filter in recent AMD CPUs may hurt the performance of
some workloads due to aliasing issues in the L1 cache.
This patch disables it on the affected CPUs.
The issue is similar to the one from last year:
http://lkml.indiana.edu/hypermail/linux/kernel/1107.3/00041.html
This new patch does not replace the old one, we just need another
quirk for newer CPUs.
The performance penalty without the patch depends on the
circumstances, but is a bit less than last year's 3%.
The workloads affected would be those that access code from the same
physical page under different virtual addresses, so different
processes using the same libraries with ASLR or multiple instances of
PIE-binaries. The code needs to be accessed simultaneously from both
cores of the same compute unit.
More details can be found here:
http://developer.amd.com/Assets/SharedL1InstructionCacheonAMD15hCPU.pdf
CPUs affected are anything with the core known as Piledriver.
That includes the new parts of the AMD A-Series (aka Trinity) and the
just released new CPUs of the FX-Series (aka Vishera).
The model numbering is a bit odd here: FX CPUs have model 2,
A-Series has model 10h, with possible extensions to 1Fh. Hence the
range of model ids.
Signed-off-by: Andre Przywara <osp@andrep.de>
Add and use MSR_AMD64_IC_CFG. Update the value whenever it is found to
not have all bits set, rather than just when it's zero.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -493,6 +493,14 @@ static void __devinit init_amd(struct cp
}
}
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if (c->x86 == 0x15 && c->x86_model >= 0x02 && c->x86_model < 0x20 &&
+ !rdmsr_safe(MSR_AMD64_IC_CFG, value) && (value & 0x1e) != 0x1e)
+ wrmsr_safe(MSR_AMD64_IC_CFG, value | 0x1e);
+
amd_get_topology(c);
/* Pointless to use MWAIT on Family10 as it does not deep sleep. */
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -206,6 +206,7 @@
/* AMD64 MSRs */
#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_IC_CFG 0xc0011021
#define MSR_AMD64_DC_CFG 0xc0011022
#define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT 46
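
As a compact restatement of when the quirk fires — a sketch under the
patch's stated constants, not a drop-in for the Xen code:

    #include <assert.h>

    /* Family 0x15, models 0x02..0x1f (Piledriver), and IC_CFG bits
     * 1..4 (mask 0x1e) not already all set. */
    static int needs_waf_quirk(unsigned int family, unsigned int model,
                               unsigned long long ic_cfg)
    {
        return family == 0x15 && model >= 0x02 && model < 0x20 &&
               (ic_cfg & 0x1e) != 0x1e;
    }

    int main(void)
    {
        assert(needs_waf_quirk(0x15, 0x02, 0));      /* FX (Vishera)      */
        assert(needs_waf_quirk(0x15, 0x10, 0));      /* A-Series (Trinity) */
        assert(!needs_waf_quirk(0x15, 0x10, 0x1e));  /* already applied    */
        assert(!needs_waf_quirk(0x15, 0x01, 0));     /* pre-Piledriver     */
        return 0;
    }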


@ -1,45 +0,0 @@
# HG changeset patch
# User Andrew Cooper <andrew.cooper3@citrix.com>
# Date 1357290407 -3600
# Node ID 8fd5635f451b073ddc99e928c975e8a7743d1321
# Parent c4114a042410d3bdec3a77c30b2e85366d7fbe1d
passthrough/domctl: use correct struct in union
This appears to be a copy-paste error from c/s 23861:ec7c81fbe0de.
It is safe, functionally speaking, as both the xen_domctl_assign_device
and xen_domctl_get_device_group structures start with a 'uint32_t
machine_sbdf'. We should, however, use the correct union structure.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -592,7 +592,7 @@ int iommu_do_domctl(
if ( ret )
break;
- seg = domctl->u.get_device_group.machine_sbdf >> 16;
+ seg = domctl->u.assign_device.machine_sbdf >> 16;
bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
devfn = domctl->u.assign_device.machine_sbdf & 0xff;
@@ -621,7 +621,7 @@ int iommu_do_domctl(
if ( ret )
goto assign_device_out;
- seg = domctl->u.get_device_group.machine_sbdf >> 16;
+ seg = domctl->u.assign_device.machine_sbdf >> 16;
bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
devfn = domctl->u.assign_device.machine_sbdf & 0xff;
@@ -649,7 +649,7 @@ int iommu_do_domctl(
if ( ret )
goto deassign_device_out;
- seg = domctl->u.get_device_group.machine_sbdf >> 16;
+ seg = domctl->u.assign_device.machine_sbdf >> 16;
bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
devfn = domctl->u.assign_device.machine_sbdf & 0xff;
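
A stand-alone sketch of the SBDF packing that both union members share
(seg in bits 31:16, bus in 15:8, devfn in 7:0), which is why the slip
was functionally harmless:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t machine_sbdf = 0x00010203; /* seg 1, bus 2, devfn 3 */
    unsigned int seg   = machine_sbdf >> 16;
    unsigned int bus   = (machine_sbdf >> 8) & 0xff;
    unsigned int devfn = machine_sbdf & 0xff;

    printf("%04x:%02x:%02x.%u\n", seg, bus, devfn >> 3, devfn & 7);
    return 0;
}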

View File

@ -12,9 +12,11 @@ IOMMU: adjust (re)assign operation parameters
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -328,34 +328,31 @@ void amd_iommu_disable_domain_device(str
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -333,34 +333,31 @@ void amd_iommu_disable_domain_device(str
disable_ats_device(iommu->seg, bus, devfn);
}
@ -59,7 +61,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
/* IO page tables might be destroyed after pci-detach the last device
* In this case, we have to re-allocate root table for next pci-attach.*/
@@ -364,17 +361,18 @@ static int reassign_device( struct domai
@@ -369,17 +366,18 @@ static int reassign_device( struct domai
amd_iommu_setup_domain_device(target, iommu, bdf);
AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n",
@ -83,7 +85,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
if ( ivrs_mappings[req_id].unity_map_enable )
{
@@ -386,7 +384,7 @@ static int amd_iommu_assign_device(struc
@@ -391,7 +389,7 @@ static int amd_iommu_assign_device(struc
ivrs_mappings[req_id].read_permission);
}
@ -92,7 +94,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
static void deallocate_next_page_table(struct page_info* pg, int level)
@@ -451,12 +449,6 @@ static void amd_iommu_domain_destroy(str
@@ -456,12 +454,6 @@ static void amd_iommu_domain_destroy(str
amd_iommu_flush_all_pages(d);
}
@ -105,7 +107,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
static int amd_iommu_add_device(struct pci_dev *pdev)
{
struct amd_iommu *iommu;
@@ -596,7 +588,7 @@ const struct iommu_ops amd_iommu_ops = {
@@ -601,7 +593,7 @@ const struct iommu_ops amd_iommu_ops = {
.teardown = amd_iommu_domain_destroy,
.map_page = amd_iommu_map_page,
.unmap_page = amd_iommu_unmap_page,
@ -114,8 +116,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
.get_device_group_id = amd_iommu_group_id,
.update_ire_from_apic = amd_iommu_ioapic_update_ire,
.update_ire_from_msi = amd_iommu_msi_msg_update_ire,
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
@@ -232,11 +232,16 @@ static int assign_device(struct domain *
return -EXDEV;
@ -158,8 +162,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
if ( ret )
{
dprintk(XENLOG_ERR VTDPREFIX,
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
@@ -1689,17 +1689,10 @@ out:
static int reassign_device_ownership(
struct domain *source,
@ -203,7 +209,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
return ret;
}
@@ -2207,36 +2203,26 @@ int __init intel_vtd_setup(void)
@@ -2222,36 +2218,26 @@ int __init intel_vtd_setup(void)
}
static int intel_iommu_assign_device(
@ -247,8 +253,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
{
ret = 0;
goto done;
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
Index: xen-4.2.2-testing/xen/include/xen/iommu.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/iommu.h
+++ xen-4.2.2-testing/xen/include/xen/iommu.h
@@ -123,13 +123,13 @@ struct iommu_ops {
int (*add_device)(struct pci_dev *pdev);
int (*enable_device)(struct pci_dev *pdev);

View File

@ -12,8 +12,10 @@ IOMMU: adjust add/remove operation parameters
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -83,14 +83,14 @@ static void disable_translation(u32 *dte
}
@ -96,7 +98,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
int __init amd_iov_detect(void)
@@ -291,16 +290,16 @@ static void __init amd_iommu_dom0_init(s
@@ -296,16 +295,16 @@ static void __init amd_iommu_dom0_init(s
}
void amd_iommu_disable_domain_device(struct domain *domain,
@ -117,7 +119,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
spin_lock_irqsave(&iommu->lock, flags);
@@ -308,7 +307,7 @@ void amd_iommu_disable_domain_device(str
@@ -313,7 +312,7 @@ void amd_iommu_disable_domain_device(str
{
disable_translation((u32 *)dte);
@ -126,7 +128,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
iommu_dte_set_iotlb((u32 *)dte, 0);
@@ -323,7 +322,8 @@ void amd_iommu_disable_domain_device(str
@@ -328,7 +327,8 @@ void amd_iommu_disable_domain_device(str
ASSERT(spin_is_locked(&pcidevs_lock));
@ -136,7 +138,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
pci_ats_enabled(iommu->seg, bus, devfn) )
disable_ats_device(iommu->seg, bus, devfn);
}
@@ -346,7 +346,7 @@ static int reassign_device(struct domain
@@ -351,7 +351,7 @@ static int reassign_device(struct domain
return -ENODEV;
}
@ -145,7 +147,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
if ( devfn == pdev->devfn )
{
@@ -359,7 +359,7 @@ static int reassign_device(struct domain
@@ -364,7 +364,7 @@ static int reassign_device(struct domain
if ( t->root_table == NULL )
allocate_domain_resources(t);
@ -154,7 +156,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n",
pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
source->domain_id, target->domain_id);
@@ -449,7 +449,7 @@ static void amd_iommu_domain_destroy(str
@@ -454,7 +454,7 @@ static void amd_iommu_domain_destroy(str
amd_iommu_flush_all_pages(d);
}
@ -163,7 +165,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
{
struct amd_iommu *iommu;
u16 bdf;
@@ -462,16 +462,16 @@ static int amd_iommu_add_device(struct p
@@ -467,16 +467,16 @@ static int amd_iommu_add_device(struct p
{
AMD_IOMMU_DEBUG("Fail to find iommu."
" %04x:%02x:%02x.%u cannot be assigned to dom%d\n",
@ -184,7 +186,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
{
struct amd_iommu *iommu;
u16 bdf;
@@ -484,12 +484,12 @@ static int amd_iommu_remove_device(struc
@@ -489,12 +489,12 @@ static int amd_iommu_remove_device(struc
{
AMD_IOMMU_DEBUG("Fail to find iommu."
" %04x:%02x:%02x.%u cannot be removed from dom%d\n",
@ -200,8 +202,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
return 0;
}
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
@@ -167,7 +167,7 @@ int iommu_add_device(struct pci_dev *pde
if ( !iommu_enabled || !hd->platform_ops )
return 0;
@ -220,8 +224,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
/*
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/pci.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/pci.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/pci.c
@@ -715,7 +715,7 @@ int __init scan_pci_devices(void)
struct setup_dom0 {
@ -249,8 +255,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
{
struct setup_dom0 ctxt = { .d = d, .handler = handler };
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
@@ -52,7 +52,7 @@ int nr_iommus;
static struct tasklet vtd_fault_tasklet;
@ -260,7 +268,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
static void setup_dom0_rmrr(struct domain *d);
static int domain_iommu_domid(struct domain *d,
@@ -1904,7 +1904,7 @@ static int rmrr_identity_mapping(struct
@@ -1904,7 +1904,7 @@ static int rmrr_identity_mapping(struct
return 0;
}
@ -326,8 +334,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
void clear_fault_bits(struct iommu *iommu)
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
Index: xen-4.2.2-testing/xen/include/xen/iommu.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/iommu.h
+++ xen-4.2.2-testing/xen/include/xen/iommu.h
@@ -120,9 +120,9 @@ bool_t pt_irq_need_timer(uint32_t flags)
struct iommu_ops {
int (*init)(struct domain *d);
@ -340,9 +350,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
int (*assign_device)(struct domain *, u8 devfn, struct pci_dev *);
void (*teardown)(struct domain *d);
int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn,
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -100,7 +100,8 @@ struct pci_dev *pci_lock_pdev(int seg, i
Index: xen-4.2.2-testing/xen/include/xen/pci.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/pci.h
+++ xen-4.2.2-testing/xen/include/xen/pci.h
@@ -101,7 +101,8 @@ struct pci_dev *pci_lock_pdev(int seg, i
struct pci_dev *pci_lock_domain_pdev(
struct domain *, int seg, int bus, int devfn);

View File

@ -12,8 +12,10 @@ VT-d: adjust context map/unmap parameters
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/extern.h
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/extern.h
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/extern.h
@@ -95,7 +95,7 @@ void free_pgtable_maddr(u64 maddr);
void *map_vtd_domain_page(u64 maddr);
void unmap_vtd_domain_page(void *va);
@ -23,8 +25,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
int domain_context_unmap_one(struct domain *domain, struct iommu *iommu,
u8 bus, u8 devfn);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
@@ -1308,7 +1308,7 @@ static void __init intel_iommu_dom0_init
int domain_context_mapping_one(
struct domain *domain,
@ -157,7 +161,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
if ( ret )
{
dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
@@ -1975,14 +1973,14 @@ static int intel_iommu_remove_device(u8
@@ -1975,14 +1973,14 @@ static int intel_iommu_remove_device(u8
}
}
@ -174,9 +178,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
if ( !err && devfn == pdev->devfn )
pci_vtd_quirk(pdev);
return err;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -292,7 +292,7 @@ static void map_me_phantom_function(stru
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/quirks.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/quirks.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/quirks.c
@@ -319,7 +319,7 @@ static void map_me_phantom_function(stru
/* map or unmap ME phantom function */
if ( map )
domain_context_mapping_one(domain, drhd->iommu, 0,

View File

@ -18,8 +18,10 @@ how to deal with such a device, and hence shouldn't try to).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/pci.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/pci.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/pci.c
@@ -144,7 +144,7 @@ static struct pci_dev *alloc_pdev(struct
spin_lock_init(&pdev->msix_table_lock);
@ -83,8 +85,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI;
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/intremap.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/intremap.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/intremap.c
@@ -426,7 +426,6 @@ void io_apic_write_remap_rte(
static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
@ -112,8 +116,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
break;
}
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/vtd/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/vtd/iommu.c
@@ -1450,7 +1450,6 @@ static int domain_context_mapping(
{
struct acpi_drhd_unit *drhd;
@ -168,9 +174,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
ret = -EINVAL;
goto out;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -62,6 +62,17 @@ struct pci_dev {
Index: xen-4.2.2-testing/xen/include/xen/pci.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/pci.h
+++ xen-4.2.2-testing/xen/include/xen/pci.h
@@ -63,6 +63,17 @@ struct pci_dev {
const u16 seg;
const u8 bus;
const u8 devfn;
@ -188,7 +196,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
struct pci_dev_info info;
struct arch_pci_dev arch;
struct {
@@ -83,18 +94,10 @@ struct pci_dev {
@@ -84,18 +95,10 @@ struct pci_dev {
extern spinlock_t pcidevs_lock;
@ -208,8 +216,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
int find_upstream_bridge(u16 seg, u8 *bus, u8 *devfn, u8 *secbus);
struct pci_dev *pci_lock_pdev(int seg, int bus, int devfn);
struct pci_dev *pci_lock_domain_pdev(
--- a/xen/include/xen/pci_regs.h
+++ b/xen/include/xen/pci_regs.h
Index: xen-4.2.2-testing/xen/include/xen/pci_regs.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/pci_regs.h
+++ xen-4.2.2-testing/xen/include/xen/pci_regs.h
@@ -371,6 +371,9 @@
#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */

View File

@ -18,8 +18,10 @@ function number, would return the underlying actual device.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_cmd.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/iommu_cmd.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -339,7 +339,15 @@ static void amd_iommu_flush_all_iotlbs(s
return;
@ -37,8 +39,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
/* Flush iommu cache after p2m changes. */
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_init.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/iommu_init.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_init.c
@@ -692,7 +692,7 @@ void parse_ppr_log_entry(struct amd_iomm
devfn = PCI_DEVFN2(device_id);
@ -48,8 +52,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
spin_unlock(&pcidevs_lock);
if ( pdev )
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_map.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/amd/iommu_map.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/amd/iommu_map.c
@@ -612,7 +612,6 @@ static int update_paging_mode(struct dom
for_each_pdev( d, pdev )
{
@ -64,25 +70,23 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
spin_lock_irqsave(&iommu->lock, flags);
- device_entry = iommu->dev_table.buffer +
- (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-
+ do {
+ req_id = get_dma_requestor_id(pdev->seg, bdf);
+ device_entry = iommu->dev_table.buffer +
+ (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
- /* valid = 0 only works for dom0 passthrough mode */
- amd_iommu_set_root_page_table((u32 *)device_entry,
- page_to_maddr(hd->root_table),
- hd->domain_id,
- hd->paging_mode, 1);
-
- amd_iommu_flush_device(iommu, req_id);
+ do {
+ req_id = get_dma_requestor_id(pdev->seg, bdf);
+ device_entry = iommu->dev_table.buffer +
+ (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+
+ /* valid = 0 only works for dom0 passthrough mode */
+ amd_iommu_set_root_page_table((u32 *)device_entry,
+ page_to_maddr(hd->root_table),
+ hd->domain_id,
+ hd->paging_mode, 1);
+
- amd_iommu_flush_device(iommu, req_id);
+ amd_iommu_flush_device(iommu, req_id);
+ bdf += pdev->phantom_stride;
+ } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
@ -90,8 +94,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
spin_unlock_irqrestore(&iommu->lock, flags);
}
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/iommu.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/iommu.c
@@ -157,6 +157,8 @@ void __init iommu_dom0_init(struct domai
int iommu_add_device(struct pci_dev *pdev)
{
@ -196,8 +202,10 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
ret = hd->platform_ops->reassign_device(d, dom0, devfn, pdev);
if ( ret )
{
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
Index: xen-4.2.2-testing/xen/drivers/passthrough/pci.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/pci.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/pci.c
@@ -146,6 +146,8 @@ static struct pci_dev *alloc_pdev(struct
/* update bus2bridge */
switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
@ -332,9 +340,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
}
}
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -58,6 +58,9 @@ do {
Index: xen-4.2.2-testing/xen/include/xen/lib.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/lib.h
+++ xen-4.2.2-testing/xen/include/xen/lib.h
@@ -58,6 +58,9 @@ do {
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))
@ -344,9 +354,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
#define reserve_bootmem(_p,_l) ((void)0)
struct domain;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -63,6 +63,8 @@ struct pci_dev {
Index: xen-4.2.2-testing/xen/include/xen/pci.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/pci.h
+++ xen-4.2.2-testing/xen/include/xen/pci.h
@@ -64,6 +64,8 @@ struct pci_dev {
const u8 bus;
const u8 devfn;
@ -355,7 +367,7 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
enum pdev_type {
DEV_TYPE_PCI_UNKNOWN,
DEV_TYPE_PCIe_ENDPOINT,
@@ -113,6 +115,7 @@ int pci_remove_device(u16 seg, u8 bus, u
@@ -114,6 +116,7 @@ int pci_remove_device(u16 seg, u8 bus, u
int pci_ro_device(int seg, int bus, int devfn);
void arch_pci_ro_device(int seg, int bdf);
struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);

View File

@ -14,9 +14,11 @@ single function devices.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -672,6 +672,16 @@ Defaults to booting secondary processors
Index: xen-4.2.2-testing/docs/misc/xen-command-line.markdown
===================================================================
--- xen-4.2.2-testing.orig/docs/misc/xen-command-line.markdown
+++ xen-4.2.2-testing/docs/misc/xen-command-line.markdown
@@ -679,6 +679,16 @@ Defaults to booting secondary processors
Default: `on`
@ -33,9 +35,11 @@ Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
### ple\_gap
> `= <integer>`
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -123,6 +123,49 @@ const unsigned long *pci_get_ro_map(u16
Index: xen-4.2.2-testing/xen/drivers/passthrough/pci.c
===================================================================
--- xen-4.2.2-testing.orig/xen/drivers/passthrough/pci.c
+++ xen-4.2.2-testing/xen/drivers/passthrough/pci.c
@@ -123,6 +123,49 @@ const unsigned long *pci_get_ro_map(u16
return pseg ? pseg->ro_map : NULL;
}

View File

@ -1,30 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1357561709 -3600
# Node ID 8e942f2f3b45edc5bb1f7a6e05de288342426f0d
# Parent 23c4bbc0111dd807561b2c62cbc5798220943a0d
x86: compat_show_guest_stack() should not truncate MFN
Re-using "addr" here was a mistake, as it is a 32-bit quantity.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/x86_64/compat/traps.c
+++ b/xen/arch/x86/x86_64/compat/traps.c
@@ -20,11 +20,12 @@ void compat_show_guest_stack(struct vcpu
if ( v != current )
{
struct vcpu *vcpu;
+ unsigned long mfn;
ASSERT(guest_kernel_mode(v, regs));
- addr = read_cr3() >> PAGE_SHIFT;
+ mfn = read_cr3() >> PAGE_SHIFT;
for_each_vcpu( v->domain, vcpu )
- if ( pagetable_get_pfn(vcpu->arch.guest_table) == addr )
+ if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
break;
if ( !vcpu )
{

View File

@ -1,30 +0,0 @@
References: CVE-2013-0154 XSA-37 bnc#797031
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1357564826 -3600
# Node ID e1facbde56ff4e5e85f9a4935abc99eb24367cd0
# Parent 8e942f2f3b45edc5bb1f7a6e05de288342426f0d
x86: fix assertion in get_page_type()
c/s 22998:e9fab50d7b61 (and immediately following ones) made it
possible for __get_page_type() to return values other than -EINVAL, in
particular -EBUSY. Consequently, the assertion in get_page_type()
should check only for the return values we absolutely don't expect to
see there.
This is XSA-37 / CVE-2013-0154.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2603,7 +2603,7 @@ int get_page_type(struct page_info *page
int rc = __get_page_type(page, type, 0);
if ( likely(rc == 0) )
return 1;
- ASSERT(rc == -EINVAL);
+ ASSERT(rc != -EINTR && rc != -EAGAIN);
return 0;
}

View File

@ -1,27 +0,0 @@
References: CVE-2012-5634 XSA-33 bnc#794316
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1357748006 -3600
# Node ID 19fd1237ff0dfa3d97a896d6ed6fbbd33f816a9f
# Parent 56b0d5476c11bfd09986080dfa97923586ef474f
VT-d: fix interrupt remapping source validation for devices behind legacy bridges
Using SVT_VERIFY_BUS here doesn't make sense; native Linux uses
SVT_VERIFY_SID_SQ here instead.
This is XSA-33 / CVE-2012-5634.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -469,7 +469,7 @@ static void set_msi_source_id(struct pci
set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16,
(bus << 8) | pdev->bus);
else if ( pdev_type(seg, bus, devfn) == DEV_TYPE_LEGACY_PCI_BRIDGE )
- set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16,
+ set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16,
PCI_BDF2(bus, devfn));
}
break;
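
The semantics at issue, sketched with the field layout assumed per the
VT-d spec: a device behind a legacy bridge raises interrupts with the
bridge's own requester id, so an exact SID match is the right check,
while SVT_VERIFY_BUS would misread the same field as a bus range:

#include <stdint.h>

#define SVT_VERIFY_SID_SQ 1 /* compare requester id against SID per SQ */
#define SVT_VERIFY_BUS    2 /* SID reinterpreted as (startbus, endbus) */

static int source_id_ok(unsigned int svt, uint16_t sid, uint16_t requester)
{
    switch (svt) {
    case SVT_VERIFY_SID_SQ: /* with SQ_ALL_16: exact 16-bit match */
        return requester == sid;
    case SVT_VERIFY_BUS:    /* only the bus number is range-checked */
        return (requester >> 8) >= (sid >> 8) &&
               (requester >> 8) <= (sid & 0xff);
    default:
        return 1;
    }
}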

View File

@ -13,11 +13,11 @@ Date: Fri Jan 11 12:22:26 2013 +0000
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.2.1-testing/tools/libxl/libxl.c
Index: xen-4.2.2-testing/tools/libxl/libxl.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl.c
+++ xen-4.2.1-testing/tools/libxl/libxl.c
@@ -1727,6 +1727,26 @@ out:
--- xen-4.2.2-testing.orig/tools/libxl/libxl.c
+++ xen-4.2.2-testing/tools/libxl/libxl.c
@@ -1710,6 +1710,26 @@ out:
return;
}
@ -44,7 +44,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.c
/******************************************************************************/
int libxl__device_disk_setdefault(libxl__gc *gc, libxl_device_disk *disk)
@@ -2563,8 +2583,7 @@ void libxl__device_nic_add(libxl__egc *e
@@ -2549,8 +2569,7 @@ void libxl__device_nic_add(libxl__egc *e
flexarray_t *front;
flexarray_t *back;
libxl__device *device;
@ -54,7 +54,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.c
rc = libxl__device_nic_setdefault(gc, nic, domid);
if (rc) goto out;
@@ -2581,16 +2600,10 @@ void libxl__device_nic_add(libxl__egc *e
@@ -2567,17 +2586,10 @@ void libxl__device_nic_add(libxl__egc *e
}
if (nic->devid == -1) {
@ -64,7 +64,8 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.c
goto out_free;
}
- if (!(l = libxl__xs_directory(gc, XBT_NULL,
- libxl__sprintf(gc, "%s/device/vif", dompath), &nb))) {
- libxl__sprintf(gc, "%s/device/vif", dompath), &nb)) ||
- nb == 0) {
- nic->devid = 0;
- } else {
- nic->devid = strtoul(l[nb - 1], NULL, 10) + 1;
@ -72,7 +73,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.c
}
GCNEW(device);
@@ -2977,6 +2990,13 @@ int libxl__device_vkb_add(libxl__gc *gc,
@@ -2964,6 +2976,13 @@ int libxl__device_vkb_add(libxl__gc *gc,
goto out_free;
}
@ -86,7 +87,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.c
rc = libxl__device_from_vkb(gc, domid, vkb, &device);
if (rc != 0) goto out_free;
@@ -3078,6 +3098,13 @@ int libxl__device_vfb_add(libxl__gc *gc,
@@ -3065,6 +3084,13 @@ int libxl__device_vfb_add(libxl__gc *gc,
goto out_free;
}

View File

@ -1,46 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1358341015 -3600
# Node ID b4cbb83f9a1f57b4f26f2d35998cda42b904ea69
# Parent 327b812026fe62a990f1d81041729c42196983ca
x86: consistently mask floating point exceptions
c/s 23142:f5e8d152a565 resulted in v->arch.fpu_ctxt to point into the
save area allocated for xsave/xrstor (when they're available). The way
vcpu_restore_fpu_lazy() works (using fpu_init() for an uninitialized
vCPU only when there's no xsave support) causes this to load whatever
arch_set_info_guest() put there, irrespective of whether the i387 state
was specified to be valid in the respective input structure.
Consequently, with a cleared (all zeroes) incoming FPU context, and with
xsave available, one gets all exceptions unmasked (as opposed to the
legacy case, where FINIT and LDMXCSR get used, masking all exceptions).
This causes e.g. para-virtualized NetWare to crash.
The behavior of arch_set_info_guest() is thus being made more hardware-
like for the FPU portion of it: considering it to be similar to INIT,
it now leaves all floating point state untouched. An alternative
would be to make the behavior RESET-like, forcing all state to known
values, albeit - taking into account legacy behavior - not to precisely
the values RESET would enforce (which masks only SSE exceptions, but
not x87 ones); that would come closest to mimicking FINIT behavior in
the xsave case. Another option would be to continue copying whatever
was provided, but override (at least) FCW and MXCSR if VGCF_I387_VALID
isn't set.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -819,7 +819,9 @@ int arch_set_info_guest(
v->arch.vgc_flags = flags;
- memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt));
+ if ( flags & VGCF_I387_VALID )
+ memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt));
+
if ( !compat )
{
memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
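
For reference, a stand-alone sketch of the legacy masked defaults the
message refers to (architectural values: FINIT loads FCW 0x037f, the
MXCSR reset value is 0x1f80):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t fcw   = 0x037f; /* x87 control word after FINIT */
    uint32_t mxcsr = 0x1f80; /* MXCSR reset value */

    printf("x87 exceptions masked: %s\n",
           (fcw & 0x3f) == 0x3f ? "yes" : "no");
    printf("SSE exceptions masked: %s\n",
           (mxcsr & 0x1f80) == 0x1f80 ? "yes" : "no");
    return 0;
}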

View File

@ -14,9 +14,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -3357,10 +3357,10 @@ void do_nmi(struct cpu_user_regs *regs)
Index: xen-4.2.2-testing/xen/arch/x86/traps.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/traps.c
+++ xen-4.2.2-testing/xen/arch/x86/traps.c
@@ -3369,10 +3369,10 @@ void do_nmi(struct cpu_user_regs *regs)
reason = inb(0x61);
if ( reason & 0x80 )
pci_serr_error(regs);

View File

@ -1,51 +0,0 @@
# HG changeset patch
# User Boris Ostrovsky <boris.ostrovsky@amd.com>
# Date 1358508058 -3600
# Node ID 8f6dd5dc5d6cdd56050ed917a0c30903bbddcbf0
# Parent eb8e9a23925d7b77c344a4a99679a45f96754a17
x86/AMD: Enable WC+ memory type on family 10 processors
In some cases BIOS may not enable WC+ memory type on family 10 processors,
instead converting what would be WC+ memory to CD type. On guests using
nested pages this could result in performance degradation. This patch
enables WC+.
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -534,6 +534,19 @@ static void __devinit init_amd(struct cp
}
#endif
+ if (c->x86 == 0x10) {
+ /*
+ * On family 10h BIOS may not have properly enabled WC+
+ * support, causing it to be converted to CD memtype. This may
+ * result in performance degradation for certain nested-paging
+ * guests. Prevent this conversion by clearing bit 24 in
+ * MSR_F10_BU_CFG2.
+ */
+ rdmsrl(MSR_F10_BU_CFG2, value);
+ value &= ~(1ULL << 24);
+ wrmsrl(MSR_F10_BU_CFG2, value);
+ }
+
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -215,8 +215,9 @@
#define MSR_F10_MC4_MISC2 0xc0000409
#define MSR_F10_MC4_MISC3 0xc000040A
-/* AMD Family10h MMU control MSRs */
-#define MSR_F10_BU_CFG 0xc0011023
+/* AMD Family10h Bus Unit MSRs */
+#define MSR_F10_BU_CFG 0xc0011023
+#define MSR_F10_BU_CFG2 0xc001102a
/* Other AMD Fam10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058

View File

@ -1,38 +0,0 @@
# HG changeset patch
# User Tim Deegan <tim@xen.org>
# Date 1358508717 -3600
# Node ID 9e8c39bdc1fedd5dfc5aa7209cc5f77f813476c7
# Parent 8f6dd5dc5d6cdd56050ed917a0c30903bbddcbf0
x86/hvm: fix RTC setting.
When the guest writes one field of the RTC time, we must bring all the
other fields up to date for the current second before calculating the
new RTC time.
Signed-off-by: Tim Deegan <tim@xen.org>
Tested-by: Phil Evans <Phil.Evans@m247.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/rtc.c
+++ b/xen/arch/x86/hvm/rtc.c
@@ -399,10 +399,17 @@ static int rtc_ioport_write(void *opaque
case RTC_DAY_OF_MONTH:
case RTC_MONTH:
case RTC_YEAR:
- s->hw.cmos_data[s->hw.cmos_index] = data;
- /* if in set mode, do not update the time */
- if ( !(s->hw.cmos_data[RTC_REG_B] & RTC_SET) )
+ /* if in set mode, just write the register */
+ if ( (s->hw.cmos_data[RTC_REG_B] & RTC_SET) )
+ s->hw.cmos_data[s->hw.cmos_index] = data;
+ else
+ {
+ /* Fetch the current time and update just this field. */
+ s->current_tm = gmtime(get_localtime(d));
+ rtc_copy_date(s);
+ s->hw.cmos_data[s->hw.cmos_index] = data;
rtc_set_time(s);
+ }
alarm_timer_update(s);
break;
case RTC_REG_A:
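
A sketch of the guest-side access pattern that exercises this path, a
single-field CMOS write through ports 0x70/0x71 (x86-specific, and it
needs I/O privilege, e.g. via ioperm(), to actually run):

#include <stdint.h>

#define RTC_MINUTES 0x02

static inline void outb(uint16_t port, uint8_t val)
{
    __asm__ __volatile__("outb %0, %1" : : "a"(val), "Nd"(port));
}

static void rtc_write(uint8_t index, uint8_t data)
{
    outb(0x70, index); /* select the CMOS register */
    outb(0x71, data);  /* write just this one field */
}

int main(void)
{
    rtc_write(RTC_MINUTES, 0x30); /* 30 minutes, BCD */
    return 0;
}

Before the fix, the emulator recomputed the time from whatever happened
to be in the other CMOS fields; the fix refreshes them from the current
time first, so a single-field write behaves as on real hardware.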

View File

@ -1,72 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1358843590 -3600
# Node ID 5af4f2ab06f33ce441fa550333a9049c09a9ef28
# Parent 4b476378fc35e776196c29dc0e24b71529393a4c
x86: restore (optional) forwarding of PCI SERR induced NMI to Dom0
c/s 22949:54fe1011f86b removed the forwarding of NMIs to Dom0 when they
were caused by PCI SERR. NMI buttons as well as BMCs (like HP's iLO)
may however want such events to be seen in Dom0 (e.g. to trigger a
dump).
Therefore restore most of the functionality which the named c/s removed
(adjusted for subsequent changes, and adjusting the public interface to
use the modern term, retaining the old one for backwards
compatibility).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -3201,6 +3201,7 @@ static void nmi_mce_softirq(void)
static void pci_serr_softirq(void)
{
printk("\n\nNMI - PCI system error (SERR)\n");
+ outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
}
void async_exception_cleanup(struct vcpu *curr)
@@ -3291,9 +3292,20 @@ static void pci_serr_error(struct cpu_us
{
outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
- /* Would like to print a diagnostic here but can't call printk()
- from NMI context -- raise a softirq instead. */
- raise_softirq(PCI_SERR_SOFTIRQ);
+ switch ( opt_nmi[0] )
+ {
+ case 'd': /* 'dom0' */
+ nmi_dom0_report(_XEN_NMIREASON_pci_serr);
+ case 'i': /* 'ignore' */
+ /* Would like to print a diagnostic here but can't call printk()
+ from NMI context -- raise a softirq instead. */
+ raise_softirq(PCI_SERR_SOFTIRQ);
+ break;
+ default: /* 'fatal' */
+ console_force_unlock();
+ printk("\n\nNMI - PCI system error (SERR)\n");
+ fatal_trap(TRAP_nmi, regs);
+ }
}
static void io_check_error(struct cpu_user_regs *regs)
--- a/xen/include/public/nmi.h
+++ b/xen/include/public/nmi.h
@@ -36,9 +36,14 @@
/* I/O-check error reported via ISA port 0x61, bit 6. */
#define _XEN_NMIREASON_io_error 0
#define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error)
+ /* PCI SERR reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_pci_serr 1
+#define XEN_NMIREASON_pci_serr (1UL << _XEN_NMIREASON_pci_serr)
+#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */
/* Parity error reported via ISA port 0x61, bit 7. */
#define _XEN_NMIREASON_parity_error 1
#define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error)
+#endif
/* Unknown hardware-generated NMI. */
#define _XEN_NMIREASON_unknown 2
#define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown)
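
A stand-alone sketch of decoding the port 0x61 status bits named in
the comments above:

#include <stdint.h>
#include <stdio.h>

#define NMI_REASON_IOCHK (1u << 6) /* I/O check error */
#define NMI_REASON_SERR  (1u << 7) /* PCI SERR */

static const char *nmi_reason(uint8_t port61)
{
    if (port61 & NMI_REASON_SERR)
        return "PCI system error (SERR)";
    if (port61 & NMI_REASON_IOCHK)
        return "I/O check error";
    return "unknown";
}

int main(void)
{
    printf("%s\n", nmi_reason(0x80)); /* e.g. bit 7 set */
    return 0;
}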

View File

@ -1,149 +0,0 @@
# HG changeset patch
# User Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
# Date 1358933464 -3600
# Node ID 9efe4c0bf9c8d3ecf03868c69c24dad3218523a4
# Parent 7c6ecf2c1831a1c7f63a96f119a8891891463e54
fix acpi_dmar_zap/reinstate() (fixes S3 regression)
Fix S3 regression introduced by cs 23013:65d26504e843 (ACPI: large
cleanup). The dmar virtual pointer returned from acpi_get_table cannot
be safely stored away and used later, as the underlying
acpi_os_map_memory / __acpi_map_table functions overwrite the mapping,
causing it to point to a table other than the DMAR (the last fetched
table is used). This subsequently causes acpi_dmar_reinstate() and
acpi_dmar_zap() to write data to the wrong table, causing its
corruption and problems with consecutive S3 resumes.
Added a new function to fetch an ACPI table's physical address, and
established a separate static mapping for the dmar_table pointer
instead of using acpi_get_table().
Signed-off-by: Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
Added call to acpi_tb_verify_table(). Fixed page count passed to
map_pages_to_xen(). Cosmetic changes.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/tables/tbxface.c
+++ b/xen/drivers/acpi/tables/tbxface.c
@@ -205,3 +205,51 @@ acpi_get_table(char *signature,
return (AE_NOT_FOUND);
}
+
+/******************************************************************************
+ *
+ * FUNCTION: acpi_get_table_phys
+ *
+ * PARAMETERS: signature - ACPI signature of needed table
+ * instance - Which instance (for SSDTs)
+ * addr - Where the table's physical address is returned
+ * len - Where the length of table is returned
+ *
+ * RETURN: Status, pointer and length of table
+ *
+ * DESCRIPTION: Finds physical address and length of ACPI table
+ *
+ *****************************************************************************/
+acpi_status __init
+acpi_get_table_phys(acpi_string signature, acpi_native_uint instance,
+ acpi_physical_address *addr, acpi_native_uint *len)
+{
+ acpi_native_uint i, j;
+ acpi_status status;
+
+ if (!signature || !addr || !len)
+ return AE_BAD_PARAMETER;
+
+ for (i = j = 0; i < acpi_gbl_root_table_list.count; i++) {
+ if (!ACPI_COMPARE_NAME(
+ &acpi_gbl_root_table_list.tables[i].signature,
+ signature))
+ continue;
+
+ if (++j < instance)
+ continue;
+
+ status =
+ acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]);
+ if (ACPI_SUCCESS(status)) {
+ *addr = acpi_gbl_root_table_list.tables[i].address;
+ *len = acpi_gbl_root_table_list.tables[i].length;
+ }
+
+ acpi_gbl_root_table_list.tables[i].pointer = NULL;
+
+ return status;
+ }
+
+ return AE_NOT_FOUND;
+}
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -776,6 +776,7 @@ out:
}
#ifdef CONFIG_X86
+#include <asm/fixmap.h>
#include <asm/tboot.h>
/* ACPI tables may not be DMA protected by tboot, so use DMAR copy */
/* SINIT saved in SinitMleData in TXT heap (which is DMA protected) */
@@ -786,7 +787,32 @@ out:
int __init acpi_dmar_init(void)
{
- acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_table);
+ acpi_physical_address dmar_addr;
+ acpi_native_uint dmar_len;
+
+ if ( ACPI_SUCCESS(acpi_get_table_phys(ACPI_SIG_DMAR, 0,
+ &dmar_addr, &dmar_len)) )
+ {
+#ifdef CONFIG_X86_32
+ if ( dmar_addr + dmar_len > (DIRECTMAP_MBYTES << 20) )
+ {
+ unsigned long offset = dmar_addr & (PAGE_SIZE - 1);
+ unsigned long mapped_size = PAGE_SIZE - offset;
+
+ set_fixmap(FIX_DMAR_ZAP_LO, dmar_addr);
+ if ( mapped_size < sizeof(*dmar_table) )
+ set_fixmap(FIX_DMAR_ZAP_HI, dmar_addr + PAGE_SIZE);
+ dmar_table = (void *)fix_to_virt(FIX_DMAR_ZAP_LO) + offset;
+ goto exit;
+ }
+#endif
+ map_pages_to_xen((unsigned long)__va(dmar_addr), PFN_DOWN(dmar_addr),
+ PFN_UP(dmar_addr + dmar_len) - PFN_DOWN(dmar_addr),
+ PAGE_HYPERVISOR);
+ dmar_table = __va(dmar_addr);
+ }
+
+ exit: __attribute__((__unused__))
return parse_dmar_table(acpi_parse_dmar);
}
--- a/xen/include/acpi/acpixf.h
+++ b/xen/include/acpi/acpixf.h
@@ -77,6 +77,9 @@ acpi_status
acpi_get_table(acpi_string signature,
acpi_native_uint instance, struct acpi_table_header **out_table);
+acpi_status
+acpi_get_table_phys(acpi_string signature, acpi_native_uint instance,
+ acpi_physical_address *addr, acpi_native_uint *len);
/*
* Namespace and name interfaces
*/
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
@@ -50,6 +50,8 @@ enum fixed_addresses {
FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
#define FIX_VGC_END FIX_PAE_HIGHMEM_0
#define FIX_VGC_BEGIN FIX_PAE_HIGHMEM_END
+ FIX_DMAR_ZAP_HI,
+ FIX_DMAR_ZAP_LO,
#else
FIX_VGC_END,
FIX_VGC_BEGIN = FIX_VGC_END

View File

@ -1,32 +0,0 @@
References: CVE-2013-0152 XSA-35 bnc#797287
# HG changeset patch
# User Ian Campbell <ian.campbell@citrix.com>
# Date 1358938044 -3600
# Node ID 621b1a889e9b120236698731e0b5ecc5b0cb1d82
# Parent 9efe4c0bf9c8d3ecf03868c69c24dad3218523a4
xen: Do not allow guests to enable nested HVM on themselves
There is no reason for this and doing so exposes a memory leak to
guests. Only toolstacks need write access to this HVM param.
This is XSA-35 / CVE-2013-0152.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Jan Beulich <JBeulich@suse.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3930,6 +3930,11 @@ long do_hvm_op(unsigned long op, XEN_GUE
rc = -EINVAL;
break;
case HVM_PARAM_NESTEDHVM:
+ if ( !IS_PRIV(current->domain) )
+ {
+ rc = -EPERM;
+ break;
+ }
#ifdef __i386__
if ( a.value )
rc = -EINVAL;

View File

@ -1,343 +0,0 @@
# HG changeset patch
# User Ian Jackson <ian.jackson@eu.citrix.com>
# Date 1359031672 0
# Node ID a181bf3e77df891c97fc20dff4e9b90b7584022b
# Parent 3e93c50982de4f2f7db99d92b04684556320541c
libxl: fix stale fd event callback race
Because there is not necessarily any lock held at the point the
application (eg, libvirt) calls libxl_osevent_occurred_timeout and
..._fd, in a multithreaded program those calls may be arbitrarily
delayed in relation to other activities within the program.
libxl therefore needs to be prepared to receive very old event
callbacks. Arrange for this to be the case for fd callbacks.
This requires a new layer of indirection through a "hook nexus" struct
which can outlive the libxl__ev_foo. Allocation and deallocation of
these nexi are mostly handled in the OSEVENT macros which wrap up
the application's callbacks.
Document the problem and the solution in a comment in libxl_event.c
just before the definition of struct libxl__osevent_hook_nexus.
There is still a race relating to libxl__osevent_occurred_timeout;
this will be addressed in the following patch.
Reported-by: Bamvor Jian Zhang <bjzhang@suse.com>
Cc: Bamvor Jian Zhang <bjzhang@suse.com>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Tested-by: Jim Fehlig <jfehlig@suse.com>
Acked-by: Jim Fehlig <jfehlig@suse.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
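
A much-reduced sketch of that indirection (hypothetical names; the
real nexi also live on per-kind idle pools protected by the ctx lock):

struct nexus {
    void *ev;           /* NULL once the registration is gone */
    void *for_app_reg;  /* the application's own cookie */
    struct nexus *next; /* idle-pool linkage */
};

/* a stale callback resolves to NULL (or to a reused nexus whose
 * condition no longer holds) instead of dereferencing freed memory */
static void *event_from_for_libxl(struct nexus *n)
{
    return n->ev;
}
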
Index: xen-4.2.1-testing/tools/libxl/libxl_event.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_event.c
+++ xen-4.2.1-testing/tools/libxl/libxl_event.c
@@ -38,23 +38,131 @@
* The application's registration hooks should be called ONLY via
* these macros, with the ctx locked. Likewise all the "occurred"
* entrypoints from the application should assert(!in_hook);
+ *
+ * During the hook call - including while the arguments are being
+ * evaluated - ev->nexus is guaranteed to be valid and refer to the
+ * nexus which is being used for this event registration. The
+ * arguments should specify ev->nexus for the for_libxl argument and
+ * ev->nexus->for_app_reg (or a pointer to it) for for_app_reg.
*/
-#define OSEVENT_HOOK_INTERN(retval, hookname, ...) do { \
- if (CTX->osevent_hooks) { \
- CTX->osevent_in_hook++; \
- retval CTX->osevent_hooks->hookname(CTX->osevent_user, __VA_ARGS__); \
- CTX->osevent_in_hook--; \
- } \
+#define OSEVENT_HOOK_INTERN(retval, failedp, evkind, hookop, nexusop, ...) do { \
+ if (CTX->osevent_hooks) { \
+ CTX->osevent_in_hook++; \
+ libxl__osevent_hook_nexi *nexi = &CTX->hook_##evkind##_nexi_idle; \
+ osevent_hook_pre_##nexusop(gc, ev, nexi, &ev->nexus); \
+ retval CTX->osevent_hooks->evkind##_##hookop \
+ (CTX->osevent_user, __VA_ARGS__); \
+ if ((failedp)) \
+ osevent_hook_failed_##nexusop(gc, ev, nexi, &ev->nexus); \
+ CTX->osevent_in_hook--; \
+ } \
} while (0)
-#define OSEVENT_HOOK(hookname, ...) ({ \
- int osevent_hook_rc = 0; \
- OSEVENT_HOOK_INTERN(osevent_hook_rc = , hookname, __VA_ARGS__); \
- osevent_hook_rc; \
+#define OSEVENT_HOOK(evkind, hookop, nexusop, ...) ({ \
+ int osevent_hook_rc = 0; \
+ OSEVENT_HOOK_INTERN(osevent_hook_rc =, !!osevent_hook_rc, \
+ evkind, hookop, nexusop, __VA_ARGS__); \
+ osevent_hook_rc; \
})
-#define OSEVENT_HOOK_VOID(hookname, ...) \
- OSEVENT_HOOK_INTERN(/* void */, hookname, __VA_ARGS__)
+#define OSEVENT_HOOK_VOID(evkind, hookop, nexusop, ...) \
+ OSEVENT_HOOK_INTERN(/* void */, 0, evkind, hookop, nexusop, __VA_ARGS__)
+
+/*
+ * The application's calls to libxl_osevent_occurred_... may be
+ * indefinitely delayed with respect to the rest of the program (since
+ * they are not necessarily called with any lock held). So the
+ * for_libxl value we receive may be (almost) arbitrarily old. All we
+ * know is that it came from this ctx.
+ *
+ * Therefore we may not free the object referred to by any for_libxl
+ * value until we free the whole libxl_ctx. And if we reuse it we
+ * must be able to tell when an old use turns up, and discard the
+ * stale event.
+ *
+ * Thus we cannot use the ev directly as the for_libxl value - we need
+ * a layer of indirection.
+ *
+ * We do this by keeping a pool of libxl__osevent_hook_nexus structs,
+ * and use pointers to them as for_libxl values. In fact, there are
+ * two pools: one for fds and one for timeouts. This ensures that we
+ * don't risk a type error when we upcast nexus->ev. In each nexus
+ * the ev is either null or points to a valid libxl__ev_time or
+ * libxl__ev_fd, as applicable.
+ *
+ * We /do/ allow ourselves to reassociate an old nexus with a new ev
+ * as otherwise we would have to leak nexi. (This reassociation
+ * might, of course, be an old ev being reused for a new purpose so
+ * simply comparing the ev pointer is not sufficient.) Thus the
+ * libxl_osevent_occurred functions need to check that the condition
+ * allegedly signalled by this event actually exists.
+ *
+ * The nexi and the lists are all protected by the ctx lock.
+ */
+
+struct libxl__osevent_hook_nexus {
+ void *ev;
+ void *for_app_reg;
+ LIBXL_SLIST_ENTRY(libxl__osevent_hook_nexus) next;
+};
+
+static void *osevent_ev_from_hook_nexus(libxl_ctx *ctx,
+ libxl__osevent_hook_nexus *nexus /* pass void *for_libxl */)
+{
+ return nexus->ev;
+}
+
+static void osevent_release_nexus(libxl__gc *gc,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus *nexus)
+{
+ nexus->ev = 0;
+ LIBXL_SLIST_INSERT_HEAD(nexi_idle, nexus, next);
+}
+
+/*----- OSEVENT* hook functions for nexusop "alloc" -----*/
+static void osevent_hook_pre_alloc(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus_r)
+{
+ libxl__osevent_hook_nexus *nexus = LIBXL_SLIST_FIRST(nexi_idle);
+ if (nexus) {
+ LIBXL_SLIST_REMOVE_HEAD(nexi_idle, next);
+ } else {
+ nexus = libxl__zalloc(NOGC, sizeof(*nexus));
+ }
+ nexus->ev = ev;
+ *nexus_r = nexus;
+}
+static void osevent_hook_failed_alloc(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus)
+{
+ osevent_release_nexus(gc, nexi_idle, *nexus);
+}
+
+/*----- OSEVENT* hook functions for nexusop "release" -----*/
+static void osevent_hook_pre_release(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus)
+{
+ osevent_release_nexus(gc, nexi_idle, *nexus);
+}
+static void osevent_hook_failed_release(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus)
+{
+ abort();
+}
+
+/*----- OSEVENT* hook functions for nexusop "noop" -----*/
+static void osevent_hook_pre_noop(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus) { }
+static void osevent_hook_failed_noop(libxl__gc *gc, void *ev,
+ libxl__osevent_hook_nexi *nexi_idle,
+ libxl__osevent_hook_nexus **nexus) { }
+
/*
* fd events
@@ -72,7 +180,8 @@ int libxl__ev_fd_register(libxl__gc *gc,
DBG("ev_fd=%p register fd=%d events=%x", ev, fd, events);
- rc = OSEVENT_HOOK(fd_register, fd, &ev->for_app_reg, events, ev);
+ rc = OSEVENT_HOOK(fd,register, alloc, fd, &ev->nexus->for_app_reg,
+ events, ev->nexus);
if (rc) goto out;
ev->fd = fd;
@@ -97,7 +206,7 @@ int libxl__ev_fd_modify(libxl__gc *gc, l
DBG("ev_fd=%p modify fd=%d events=%x", ev, ev->fd, events);
- rc = OSEVENT_HOOK(fd_modify, ev->fd, &ev->for_app_reg, events);
+ rc = OSEVENT_HOOK(fd,modify, noop, ev->fd, &ev->nexus->for_app_reg, events);
if (rc) goto out;
ev->events = events;
@@ -119,7 +228,7 @@ void libxl__ev_fd_deregister(libxl__gc *
DBG("ev_fd=%p deregister fd=%d", ev, ev->fd);
- OSEVENT_HOOK_VOID(fd_deregister, ev->fd, ev->for_app_reg);
+ OSEVENT_HOOK_VOID(fd,deregister, release, ev->fd, ev->nexus->for_app_reg);
LIBXL_LIST_REMOVE(ev, entry);
ev->fd = -1;
@@ -171,7 +280,8 @@ static int time_register_finite(libxl__g
{
int rc;
- rc = OSEVENT_HOOK(timeout_register, &ev->for_app_reg, absolute, ev);
+ rc = OSEVENT_HOOK(timeout,register, alloc, &ev->nexus->for_app_reg,
+ absolute, ev->nexus);
if (rc) return rc;
ev->infinite = 0;
@@ -184,7 +294,7 @@ static int time_register_finite(libxl__g
static void time_deregister(libxl__gc *gc, libxl__ev_time *ev)
{
if (!ev->infinite) {
- OSEVENT_HOOK_VOID(timeout_deregister, ev->for_app_reg);
+ OSEVENT_HOOK_VOID(timeout,deregister, release, ev->nexus->for_app_reg);
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
}
}
@@ -270,7 +380,8 @@ int libxl__ev_time_modify_abs(libxl__gc
rc = time_register_finite(gc, ev, absolute);
if (rc) goto out;
} else {
- rc = OSEVENT_HOOK(timeout_modify, &ev->for_app_reg, absolute);
+ rc = OSEVENT_HOOK(timeout,modify, noop,
+ &ev->nexus->for_app_reg, absolute);
if (rc) goto out;
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
@@ -1009,35 +1120,54 @@ void libxl_osevent_register_hooks(libxl_
void libxl_osevent_occurred_fd(libxl_ctx *ctx, void *for_libxl,
- int fd, short events, short revents)
+ int fd, short events_ign, short revents_ign)
{
- libxl__ev_fd *ev = for_libxl;
-
EGC_INIT(ctx);
CTX_LOCK;
assert(!CTX->osevent_in_hook);
- assert(fd == ev->fd);
- revents &= ev->events;
- if (revents)
- ev->func(egc, ev, fd, ev->events, revents);
+ libxl__ev_fd *ev = osevent_ev_from_hook_nexus(ctx, for_libxl);
+ if (!ev) goto out;
+ if (ev->fd != fd) goto out;
+ struct pollfd check;
+ for (;;) {
+ check.fd = fd;
+ check.events = ev->events;
+ int r = poll(&check, 1, 0);
+ if (!r)
+ goto out;
+ if (r==1)
+ break;
+ assert(r<0);
+ if (errno != EINTR) {
+ LIBXL__EVENT_DISASTER(egc, "failed poll to check for fd", errno, 0);
+ goto out;
+ }
+ }
+
+ if (check.revents)
+ ev->func(egc, ev, fd, ev->events, check.revents);
+
+ out:
CTX_UNLOCK;
EGC_FREE;
}
void libxl_osevent_occurred_timeout(libxl_ctx *ctx, void *for_libxl)
{
- libxl__ev_time *ev = for_libxl;
-
EGC_INIT(ctx);
CTX_LOCK;
assert(!CTX->osevent_in_hook);
+ libxl__ev_time *ev = osevent_ev_from_hook_nexus(ctx, for_libxl);
+ if (!ev) goto out;
assert(!ev->infinite);
+
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
ev->func(egc, ev, &ev->abs);
+ out:
CTX_UNLOCK;
EGC_FREE;
}
Index: xen-4.2.1-testing/tools/libxl/libxl_internal.h
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_internal.h
+++ xen-4.2.1-testing/tools/libxl/libxl_internal.h
@@ -136,6 +136,8 @@ typedef struct libxl__gc libxl__gc;
typedef struct libxl__egc libxl__egc;
typedef struct libxl__ao libxl__ao;
typedef struct libxl__aop_occurred libxl__aop_occurred;
+typedef struct libxl__osevent_hook_nexus libxl__osevent_hook_nexus;
+typedef struct libxl__osevent_hook_nexi libxl__osevent_hook_nexi;
_hidden void libxl__alloc_failed(libxl_ctx *, const char *func,
size_t nmemb, size_t size) __attribute__((noreturn));
@@ -163,7 +165,7 @@ struct libxl__ev_fd {
libxl__ev_fd_callback *func;
/* remainder is private for libxl__ev_fd... */
LIBXL_LIST_ENTRY(libxl__ev_fd) entry;
- void *for_app_reg;
+ libxl__osevent_hook_nexus *nexus;
};
@@ -178,7 +180,7 @@ struct libxl__ev_time {
int infinite; /* not registered in list or with app if infinite */
LIBXL_TAILQ_ENTRY(libxl__ev_time) entry;
struct timeval abs;
- void *for_app_reg;
+ libxl__osevent_hook_nexus *nexus;
};
typedef struct libxl__ev_xswatch libxl__ev_xswatch;
@@ -329,6 +331,8 @@ struct libxl__ctx {
libxl__poller poller_app; /* libxl_osevent_beforepoll and _afterpoll */
LIBXL_LIST_HEAD(, libxl__poller) pollers_event, pollers_idle;
+ LIBXL_SLIST_HEAD(libxl__osevent_hook_nexi, libxl__osevent_hook_nexus)
+ hook_fd_nexi_idle, hook_timeout_nexi_idle;
LIBXL_LIST_HEAD(, libxl__ev_fd) efds;
LIBXL_TAILQ_HEAD(, libxl__ev_time) etimes;

View File

@ -1,228 +0,0 @@
# HG changeset patch
# User Ian Jackson <ian.jackson@eu.citrix.com>
# Date 1359031673 0
# Node ID a162a72e719a85799e3b08f52af7bb2147a407b8
# Parent a181bf3e77df891c97fc20dff4e9b90b7584022b
libxl: fix stale timeout event callback race
Because there is not necessarily any lock held at the point the
application (eg, libvirt) calls libxl_osevent_occurred_timeout, in a
multithreaded program those calls may be arbitrarily delayed in
relation to other activities within the program.
Specifically this means when ->timeout_deregister returns, libxl does
not know whether it can safely dispose of the for_libxl value or
whether it needs to retain it in case of an in-progress call to
_occurred_timeout.
The interface could be fixed by requiring the application to make a
new call into libxl to say that the deregistration was complete.
However that new call would have to be threaded through the
application's event loop; this is complicated and some application
authors are likely not to implement it properly. Furthermore the
easiest way to implement this facility in most event loops is to queue
up a time event for "now".
Shortcut all of this by having libxl always call timeout_modify
setting abs={0,0} (ie, ASAP) instead of timeout_deregister. This will
cause the application to call _occurred_timeout. When processing this
calldown we see that we were no longer actually interested and simply
throw it away.
Additionally, there is a race between _occurred_timeout and
->timeout_modify. If libxl ever adjusts the deadline for a timeout
the application may already be in the process of calling _occurred, in
which case the situation with for_app's lifetime becomes very
complicated. Therefore abolish libxl__ev_time_modify_{abs,rel} (which
have no callers) and promise to the application only ever to call
->timeout_modify with abs=={0,0}. The application still needs to cope
with ->timeout_modify racing with its internal function which calls
_occurred_timeout. Document this.
This is a forwards-compatible change for applications using the libxl
API, and will hopefully eliminate these races in callback-supplying
applications (such as libvirt) without the need for corresponding
changes to the application. (It is possible that this might expose
bugs in applications, though, as previously libxl would never call
libxl_osevent_hooks->timeout_modify and now it never calls
->timeout_deregister).
For clarity, fold the body of time_register_finite into its one
remaining call site. This makes the semantics of ev->infinite
slightly clearer.
Cc: Bamvor Jian Zhang <bjzhang@suse.com>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Tested-by: Jim Fehlig <jfehlig@suse.com>
Acked-by: Jim Fehlig <jfehlig@suse.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
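
A sketch of what an application's hook now has to handle
(reschedule_timer_now() is a hypothetical event-loop helper):

#include <sys/time.h>

int reschedule_timer_now(void *app_timer); /* hypothetical helper */

static int my_timeout_modify(void *user, void **for_app_registration,
                             struct timeval abs)
{
    /* abs is only ever {0,0}: fire the existing timer as soon as
     * possible, so the resulting _occurred_timeout call lets libxl
     * discard the stale registration */
    (void)user; (void)abs;
    return reschedule_timer_now(*for_app_registration);
}
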
Index: xen-4.2.1-testing/tools/libxl/libxl_event.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_event.c
+++ xen-4.2.1-testing/tools/libxl/libxl_event.c
@@ -267,18 +267,11 @@ static int time_rel_to_abs(libxl__gc *gc
return 0;
}
-static void time_insert_finite(libxl__gc *gc, libxl__ev_time *ev)
-{
- libxl__ev_time *evsearch;
- LIBXL_TAILQ_INSERT_SORTED(&CTX->etimes, entry, ev, evsearch, /*empty*/,
- timercmp(&ev->abs, &evsearch->abs, >));
- ev->infinite = 0;
-}
-
static int time_register_finite(libxl__gc *gc, libxl__ev_time *ev,
struct timeval absolute)
{
int rc;
+ libxl__ev_time *evsearch;
rc = OSEVENT_HOOK(timeout,register, alloc, &ev->nexus->for_app_reg,
absolute, ev->nexus);
@@ -286,7 +279,8 @@ static int time_register_finite(libxl__g
ev->infinite = 0;
ev->abs = absolute;
- time_insert_finite(gc, ev);
+ LIBXL_TAILQ_INSERT_SORTED(&CTX->etimes, entry, ev, evsearch, /*empty*/,
+ timercmp(&ev->abs, &evsearch->abs, >));
return 0;
}
@@ -294,7 +288,12 @@ static int time_register_finite(libxl__g
static void time_deregister(libxl__gc *gc, libxl__ev_time *ev)
{
if (!ev->infinite) {
- OSEVENT_HOOK_VOID(timeout,deregister, release, ev->nexus->for_app_reg);
+ struct timeval right_away = { 0, 0 };
+ if (ev->nexus) /* only set if app provided hooks */
+ ev->nexus->ev = 0;
+ OSEVENT_HOOK_VOID(timeout,modify,
+ noop /* release nexus in _occurred_ */,
+ &ev->nexus->for_app_reg, right_away);
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
}
}
@@ -364,70 +363,6 @@ int libxl__ev_time_register_rel(libxl__g
return rc;
}
-int libxl__ev_time_modify_abs(libxl__gc *gc, libxl__ev_time *ev,
- struct timeval absolute)
-{
- int rc;
-
- CTX_LOCK;
-
- DBG("ev_time=%p modify abs==%lu.%06lu",
- ev, (unsigned long)absolute.tv_sec, (unsigned long)absolute.tv_usec);
-
- assert(libxl__ev_time_isregistered(ev));
-
- if (ev->infinite) {
- rc = time_register_finite(gc, ev, absolute);
- if (rc) goto out;
- } else {
- rc = OSEVENT_HOOK(timeout,modify, noop,
- &ev->nexus->for_app_reg, absolute);
- if (rc) goto out;
-
- LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
- ev->abs = absolute;
- time_insert_finite(gc, ev);
- }
-
- rc = 0;
- out:
- time_done_debug(gc,__func__,ev,rc);
- CTX_UNLOCK;
- return rc;
-}
-
-int libxl__ev_time_modify_rel(libxl__gc *gc, libxl__ev_time *ev,
- int milliseconds)
-{
- struct timeval absolute;
- int rc;
-
- CTX_LOCK;
-
- DBG("ev_time=%p modify ms=%d", ev, milliseconds);
-
- assert(libxl__ev_time_isregistered(ev));
-
- if (milliseconds < 0) {
- time_deregister(gc, ev);
- ev->infinite = 1;
- rc = 0;
- goto out;
- }
-
- rc = time_rel_to_abs(gc, milliseconds, &absolute);
- if (rc) goto out;
-
- rc = libxl__ev_time_modify_abs(gc, ev, absolute);
- if (rc) goto out;
-
- rc = 0;
- out:
- time_done_debug(gc,__func__,ev,rc);
- CTX_UNLOCK;
- return rc;
-}
-
void libxl__ev_time_deregister(libxl__gc *gc, libxl__ev_time *ev)
{
CTX_LOCK;
@@ -1160,7 +1095,11 @@ void libxl_osevent_occurred_timeout(libx
CTX_LOCK;
assert(!CTX->osevent_in_hook);
- libxl__ev_time *ev = osevent_ev_from_hook_nexus(ctx, for_libxl);
+ libxl__osevent_hook_nexus *nexus = for_libxl;
+ libxl__ev_time *ev = osevent_ev_from_hook_nexus(ctx, nexus);
+
+ osevent_release_nexus(gc, &CTX->hook_timeout_nexi_idle, nexus);
+
if (!ev) goto out;
assert(!ev->infinite);
Index: xen-4.2.1-testing/tools/libxl/libxl_event.h
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_event.h
+++ xen-4.2.1-testing/tools/libxl/libxl_event.h
@@ -287,8 +287,10 @@ typedef struct libxl_osevent_hooks {
int (*timeout_register)(void *user, void **for_app_registration_out,
struct timeval abs, void *for_libxl);
int (*timeout_modify)(void *user, void **for_app_registration_update,
- struct timeval abs);
- void (*timeout_deregister)(void *user, void *for_app_registration);
+ struct timeval abs)
+ /* only ever called with abs={0,0}, meaning ASAP */;
+ void (*timeout_deregister)(void *user, void *for_app_registration)
+ /* will never be called */;
} libxl_osevent_hooks;
/* The application which calls register_fd_hooks promises to
@@ -337,6 +339,17 @@ typedef struct libxl_osevent_hooks {
* register (or modify), and pass it to subsequent calls to modify
* or deregister.
*
+ * Note that the application must cope with a call from libxl to
+ * timeout_modify racing with its own call to
+ * libxl__osevent_occurred_timeout. libxl guarantees that
+ * timeout_modify will only be called with abs={0,0} but the
+ * application must still ensure that libxl's attempt to cause the
+ * timeout to occur immediately is safely ignored even the timeout is
+ * actually already in the process of occurring.
+ *
+ * timeout_deregister is not used because it forms part of a
+ * deprecated unsafe mode of use of the API.
+ *
* osevent_register_hooks may be called only once for each libxl_ctx.
* libxl may make calls to register/modify/deregister from within
* any libxl function (indeed, it will usually call register from

View File

@ -1,64 +0,0 @@
# HG changeset patch
# User Keir Fraser <keir@xen.org>
# Date 1359566139 28800
# Node ID 8201b6ec3564c80db5516cdcf36dcfa9b7fdd93b
# Parent 1fe8ecfdf10cc9077fc810364663a0f25a5c5b96
vmx: Simplify cr0 update handling by deferring cr4 changes to the cr4 handler.
Signed-off-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1133,20 +1133,18 @@ static void vmx_update_guest_cr(struct v
if ( paging_mode_hap(v->domain) )
{
- /* We manage GUEST_CR3 when guest CR0.PE is zero or when cr3 memevents are on */
+ /* Manage GUEST_CR3 when CR0.PE=0. */
uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING);
v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
if ( !hvm_paging_enabled(v) )
v->arch.hvm_vmx.exec_control |= cr3_ctls;
+ /* Trap CR3 updates if CR3 memory events are enabled. */
if ( v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_CR3] )
v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING;
vmx_update_cpu_exec_control(v);
-
- /* Changing CR0.PE can change some bits in real CR4. */
- vmx_update_guest_cr(v, 4);
}
if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
@@ -1176,8 +1174,6 @@ static void vmx_update_guest_cr(struct v
{
for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
vmx_set_segment_register(v, s, &reg[s]);
- v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
- __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
v->arch.hvm_vmx.exception_bitmap = 0xffffffff;
vmx_update_exception_bitmap(v);
}
@@ -1187,10 +1183,6 @@ static void vmx_update_guest_cr(struct v
if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
vmx_set_segment_register(
v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
- v->arch.hvm_vcpu.hw_cr[4] =
- ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
- |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
- __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK
| (paging_mode_hap(v->domain) ?
0 : (1U << TRAP_page_fault))
@@ -1204,6 +1196,9 @@ static void vmx_update_guest_cr(struct v
v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
__vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
__vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
+
+ /* Changing CR0 can change some bits in real CR4. */
+ vmx_update_guest_cr(v, 4);
break;
}
case 2:

View File

@ -1,39 +0,0 @@
# HG changeset patch
# User Dongxiao Xu <dongxiao.xu@intel.com>
# Date 1359566250 28800
# Node ID d1bf3b21f78302dad1ed53e540facf7b9a0e2ab5
# Parent 8201b6ec3564c80db5516cdcf36dcfa9b7fdd93b
VMX: disable SMEP feature when guest is in non-paging mode

SMEP is disabled if the CPU is in non-paging mode in hardware.
However, Xen always uses paging mode to emulate guest non-paging
mode with HAP. To emulate this behavior, SMEP needs to be manually
disabled when the guest switches to non-paging mode.

We hit an issue where an SMP Linux guest with a recent kernel (one
with SMEP support, e.g. 3.5.3) would crash with a triple fault if
unrestricted_guest=0 was set in grub. This is because Xen uses an
identity-mapping page table, set with the USER flag, to emulate
non-paging mode. If SMEP is still enabled in this case, the guest
hits an unhandleable page fault and then crashes.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Committed-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1227,6 +1227,13 @@ static void vmx_update_guest_cr(struct v
{
v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+ /*
+ * SMEP is disabled if CPU is in non-paging mode in hardware.
+ * However Xen always uses paging mode to emulate guest non-paging
+ * mode with HAP. To emulate this behavior, SMEP needs to be
+ * manually disabled when guest switches to non-paging mode.
+ */
+ v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_SMEP;
}
__vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
__vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);

View File

@ -1,37 +0,0 @@
References: CVE-2013-0153 XSA-36 bnc#800275
# HG changeset patch
# User Boris Ostrovsky <boris.ostrovsky@amd.com>
# Date 1360073898 -3600
# Node ID 32d4516a97f0b22ed06155f7b8e0bff075024991
# Parent 2fdca30363f08026971c094e8a1a84e19ca3e55b
ACPI: acpi_table_parse() should return handler's error code

Currently, the error code returned by acpi_table_parse()'s handler
is ignored. This patch propagates the handler's return value to
acpi_table_parse()'s caller.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/tables.c
+++ b/xen/drivers/acpi/tables.c
@@ -267,7 +267,7 @@ acpi_table_parse_madt(enum acpi_madt_typ
* @handler: handler to run
*
* Scan the ACPI System Descriptor Table (STD) for a table matching @id,
- * run @handler on it. Return 0 if table found, return on if not.
+ * run @handler on it.
*/
int __init acpi_table_parse(char *id, acpi_table_handler handler)
{
@@ -282,8 +282,7 @@ int __init acpi_table_parse(char *id, ac
acpi_get_table(id, 0, &table);
if (table) {
- handler(table);
- return 0;
+ return handler(table);
} else
return 1;
}
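
A hypothetical caller showing why the propagated value matters
(count_entries() and the use of ACPI_SIG_MADT are illustrative only,
not part of this patch):

    #include <xen/acpi.h>
    #include <xen/errno.h>
    #include <xen/lib.h>

    static int __init count_entries(struct acpi_table_header *table)
    {
        /* The handler can report its own failure... */
        return table->length < sizeof(*table) ? -EINVAL : 0;
    }

    static int __init my_init(void)
    {
        /* ...and after this patch that code reaches the caller, whereas
         * previously a found table always yielded 0. */
        int rc = acpi_table_parse(ACPI_SIG_MADT, count_entries);
        if (rc)
            printk(XENLOG_WARNING "MADT absent or handler failed: %d\n", rc);
        return rc;
    }
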

View File

@ -1,205 +0,0 @@
References: CVE-2013-0153 XSA-36 bnc#800275
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1360074047 -3600
# Node ID 601139e2b0db7dc8a5bb69b9b7373fb87742741c
# Parent 32d4516a97f0b22ed06155f7b8e0bff075024991
AMD,IOMMU: Clean up old entries in remapping tables when creating new one

When changing the affinity of an IRQ associated with a passed-through
PCI device, clear the previous mapping.

This is XSA-36 / CVE-2013-0153.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

In addition, because some BIOSes may incorrectly program IVRS entries
for the IO-APIC, try to check entries for consistency. Specifically,
if conflicting entries are found, disable the IOMMU if a per-device
remapping table is used. If entries refer to bogus IO-APIC IDs,
disable the IOMMU unconditionally.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -22,6 +22,7 @@
#include <xen/errno.h>
#include <xen/acpi.h>
#include <asm/apicdef.h>
+#include <asm/io_apic.h>
#include <asm/amd-iommu.h>
#include <asm/hvm/svm/amd-iommu-proto.h>
@@ -635,6 +636,7 @@ static u16 __init parse_ivhd_device_spec
u16 header_length, u16 block_length, struct amd_iommu *iommu)
{
u16 dev_length, bdf;
+ int apic;
dev_length = sizeof(*special);
if ( header_length < (block_length + dev_length) )
@@ -651,10 +653,59 @@ static u16 __init parse_ivhd_device_spec
}
add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu);
- /* set device id of ioapic */
- ioapic_sbdf[special->handle].bdf = bdf;
- ioapic_sbdf[special->handle].seg = seg;
- return dev_length;
+
+ if ( special->variety != ACPI_IVHD_IOAPIC )
+ {
+ if ( special->variety != ACPI_IVHD_HPET )
+ printk(XENLOG_ERR "Unrecognized IVHD special variety %#x\n",
+ special->variety);
+ return dev_length;
+ }
+
+ /*
+ * Some BIOSes have IOAPIC broken entries so we check for IVRS
+ * consistency here --- whether entry's IOAPIC ID is valid and
+ * whether there are conflicting/duplicated entries.
+ */
+ for ( apic = 0; apic < nr_ioapics; apic++ )
+ {
+ if ( IO_APIC_ID(apic) != special->handle )
+ continue;
+
+ if ( ioapic_sbdf[special->handle].pin_setup )
+ {
+ if ( ioapic_sbdf[special->handle].bdf == bdf &&
+ ioapic_sbdf[special->handle].seg == seg )
+ AMD_IOMMU_DEBUG("IVHD Warning: Duplicate IO-APIC %#x entries\n",
+ special->handle);
+ else
+ {
+ printk(XENLOG_ERR "IVHD Error: Conflicting IO-APIC %#x entries\n",
+ special->handle);
+ if ( amd_iommu_perdev_intremap )
+ return 0;
+ }
+ }
+ else
+ {
+ /* set device id of ioapic */
+ ioapic_sbdf[special->handle].bdf = bdf;
+ ioapic_sbdf[special->handle].seg = seg;
+
+ ioapic_sbdf[special->handle].pin_setup = xzalloc_array(
+ unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic]));
+ if ( nr_ioapic_entries[apic] &&
+ !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
+ {
+ printk(XENLOG_ERR "IVHD Error: Out of memory\n");
+ return 0;
+ }
+ }
+ return dev_length;
+ }
+
+ printk(XENLOG_ERR "IVHD Error: Invalid IO-APIC %#x\n", special->handle);
+ return 0;
}
static int __init parse_ivhd_block(const struct acpi_ivrs_hardware *ivhd_block)
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -99,12 +99,12 @@ static void update_intremap_entry(u32* e
static void update_intremap_entry_from_ioapic(
int bdf,
struct amd_iommu *iommu,
- struct IO_APIC_route_entry *ioapic_rte)
+ const struct IO_APIC_route_entry *rte,
+ const struct IO_APIC_route_entry *old_rte)
{
unsigned long flags;
u32* entry;
u8 delivery_mode, dest, vector, dest_mode;
- struct IO_APIC_route_entry *rte = ioapic_rte;
int req_id;
spinlock_t *lock;
int offset;
@@ -120,6 +120,14 @@ static void update_intremap_entry_from_i
spin_lock_irqsave(lock, flags);
offset = get_intremap_offset(vector, delivery_mode);
+ if ( old_rte )
+ {
+ int old_offset = get_intremap_offset(old_rte->vector,
+ old_rte->delivery_mode);
+
+ if ( offset != old_offset )
+ free_intremap_entry(iommu->seg, bdf, old_offset);
+ }
entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset);
update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
@@ -188,6 +196,7 @@ int __init amd_iommu_setup_ioapic_remapp
amd_iommu_flush_intremap(iommu, req_id);
spin_unlock_irqrestore(&iommu->lock, flags);
}
+ set_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup);
}
}
return 0;
@@ -199,6 +208,7 @@ void amd_iommu_ioapic_update_ire(
struct IO_APIC_route_entry old_rte = { 0 };
struct IO_APIC_route_entry new_rte = { 0 };
unsigned int rte_lo = (reg & 1) ? reg - 1 : reg;
+ unsigned int pin = (reg - 0x10) / 2;
int saved_mask, seg, bdf;
struct amd_iommu *iommu;
@@ -236,6 +246,14 @@ void amd_iommu_ioapic_update_ire(
*(((u32 *)&new_rte) + 1) = value;
}
+ if ( new_rte.mask &&
+ !test_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) )
+ {
+ ASSERT(saved_mask);
+ __io_apic_write(apic, reg, value);
+ return;
+ }
+
/* mask the interrupt while we change the intremap table */
if ( !saved_mask )
{
@@ -244,7 +262,11 @@ void amd_iommu_ioapic_update_ire(
}
/* Update interrupt remapping entry */
- update_intremap_entry_from_ioapic(bdf, iommu, &new_rte);
+ update_intremap_entry_from_ioapic(
+ bdf, iommu, &new_rte,
+ test_and_set_bit(pin,
+ ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) ? &old_rte
+ : NULL);
/* Forward write access to IO-APIC RTE */
__io_apic_write(apic, reg, value);
@@ -354,6 +376,12 @@ void amd_iommu_msi_msg_update_ire(
return;
}
+ if ( msi_desc->remap_index >= 0 )
+ update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, NULL);
+
+ if ( !msg )
+ return;
+
update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, msg);
}
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -100,6 +100,7 @@ void amd_iommu_read_msi_from_ire(
extern struct ioapic_sbdf {
u16 bdf, seg;
+ unsigned long *pin_setup;
} ioapic_sbdf[MAX_IO_APICS];
extern void *shared_intremap_table;

View File

@ -1,77 +0,0 @@
References: CVE-2013-0153 XSA-36 bnc#800275
# HG changeset patch
# User Boris Ostrovsky <boris.ostrovsky@amd.com>
# Date 1360074085 -3600
# Node ID e379a23b04655e9e43dc50944a5c9d1e59d8bee9
# Parent 601139e2b0db7dc8a5bb69b9b7373fb87742741c
AMD,IOMMU: Disable IOMMU if SATA Combined mode is on

AMD's SP5100 chipset can be placed into SATA Combined mode, which
may prevent dom0 from booting when the IOMMU is enabled and a
per-device interrupt remapping table is used.

While SP5100 erratum 28 requires BIOSes to disable this mode,
some may still use it.

This patch checks whether this mode is on and, if a per-device
table is in use, disables the IOMMU.

This is XSA-36 / CVE-2013-0153.
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Flipped operands of && in amd_iommu_init() to make the message issued
by amd_sp5100_erratum28() match reality (when amd_iommu_perdev_intremap
is zero, there's really no point in calling the function).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1118,12 +1118,45 @@ static int __init amd_iommu_setup_device
return 0;
}
+/* Check whether SP5100 SATA Combined mode is on */
+static bool_t __init amd_sp5100_erratum28(void)
+{
+ u32 bus, id;
+ u16 vendor_id, dev_id;
+ u8 byte;
+
+ for (bus = 0; bus < 256; bus++)
+ {
+ id = pci_conf_read32(0, bus, 0x14, 0, PCI_VENDOR_ID);
+
+ vendor_id = id & 0xffff;
+ dev_id = (id >> 16) & 0xffff;
+
+ /* SP5100 SMBus module sets Combined mode on */
+ if (vendor_id != 0x1002 || dev_id != 0x4385)
+ continue;
+
+ byte = pci_conf_read8(0, bus, 0x14, 0, 0xad);
+ if ( (byte >> 3) & 1 )
+ {
+ printk(XENLOG_WARNING "AMD-Vi: SP5100 erratum 28 detected, disabling IOMMU.\n"
+ "If possible, disable SATA Combined mode in BIOS or contact your vendor for BIOS update.\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
int __init amd_iommu_init(void)
{
struct amd_iommu *iommu;
BUG_ON( !iommu_found() );
+ if ( amd_iommu_perdev_intremap && amd_sp5100_erratum28() )
+ goto error_out;
+
ivrs_bdf_entries = amd_iommu_get_ivrs_dev_entries();
if ( !ivrs_bdf_entries )

View File

@ -1,55 +0,0 @@
References: CVE-2013-0153 XSA-36 bnc#800275
# HG changeset patch
# User Boris Ostrovsky <boris.ostrovsky@amd.com>
# Date 1360074131 -3600
# Node ID 1af531e7bc2fc518f16d8d1461083c528e1517cf
# Parent e379a23b04655e9e43dc50944a5c9d1e59d8bee9
AMD,IOMMU: Make per-device interrupt remapping table default

Using a global interrupt remapping table may be insecure, as
described by XSA-36. This patch makes per-device mode the default.

This is XSA-36 / CVE-2013-0153.
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Moved warning in amd_iov_detect() to location covering all cases.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1942,9 +1942,6 @@ int map_domain_pirq(
spin_lock_irqsave(&desc->lock, flags);
set_domain_irq_pirq(d, irq, info);
spin_unlock_irqrestore(&desc->lock, flags);
-
- if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV )
- printk(XENLOG_INFO "Per-device vector maps for GSIs not implemented yet.\n");
}
done:
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -204,6 +204,8 @@ int __init amd_iov_detect(void)
{
printk("AMD-Vi: Not overriding irq_vector_map setting\n");
}
+ if ( !amd_iommu_perdev_intremap )
+ printk(XENLOG_WARNING "AMD-Vi: Using global interrupt remap table is not recommended (see XSA-36)!\n");
return scan_pci_devices();
}
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -52,7 +52,7 @@ bool_t __read_mostly iommu_qinval = 1;
bool_t __read_mostly iommu_intremap = 1;
bool_t __read_mostly iommu_hap_pt_share = 1;
bool_t __read_mostly iommu_debug;
-bool_t __read_mostly amd_iommu_perdev_intremap;
+bool_t __read_mostly amd_iommu_perdev_intremap = 1;
DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);

View File

@ -1,37 +0,0 @@
# HG changeset patch
# User Olaf Hering <olaf@aepfle.de>
# Date 1360664991 -3600
# Node ID a37aa55c3cbcb0e8340b4985314ef8fb31d7610b
# Parent 9af6e566befe5516e66b62197813aa22e1d7122c
unmodified_drivers: __devinit was removed in linux-3.8
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Merge with __init handling.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
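
For illustration, the kind of driver code this shim keeps building
across kernel versions (demo_probe() is hypothetical; in the real
build the compat header is force-included by the build system):

    #include <linux/pci.h>
    #include <xen/platform-compat.h>

    /* On kernels before 3.8 __devinit comes from <linux/init.h>; on 3.8+
     * it no longer exists, so the shim defines it (and __devinitdata) to
     * nothing and this declaration compiles unchanged either way. */
    static int __devinit demo_probe(struct pci_dev *dev,
                                    const struct pci_device_id *id)
    {
        return 0;
    }
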
--- a/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h
+++ b/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h
@@ -13,10 +13,19 @@
#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
#endif
-#if defined(_LINUX_INIT_H) && !defined(__init)
+#ifdef _LINUX_INIT_H
+
+#ifndef __init
#define __init
#endif
+#ifndef __devinit
+#define __devinit
+#define __devinitdata
+#endif
+
+#endif /* _LINUX_INIT_H */
+
#if defined(__LINUX_CACHE_H) && !defined(__read_mostly)
#define __read_mostly
#endif

View File

@ -1,21 +0,0 @@
# HG changeset patch
# User Keir Fraser <keir@xen.org>
# Date 1360775011 0
# Node ID 97b7e546e2e4a021491e198a33f7d685550ebc73
# Parent 742dde457258422a3d08e3ddbf9a7eae55c93acb
gcc4.8 build fix: Add -Wno-unused-local-typedefs to CFLAGS.
Based on a patch by M A Young <m.a.young@durham.ac.uk>
Signed-off-by: Keir Fraser <keir@xen.org>
--- a/Config.mk
+++ b/Config.mk
@@ -166,6 +166,7 @@ CFLAGS-$(clang) += -Wno-parentheses -Wno
$(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement)
$(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement)
$(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable)
+$(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs)
LDFLAGS += $(foreach i, $(EXTRA_LIB), -L$(i))
CFLAGS += $(foreach i, $(EXTRA_INCLUDES), -I$(i))

View File

@ -1,127 +0,0 @@
References: CVE-2013-0153 XSA-36 bnc#800275
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1360831252 -3600
# Node ID e68f14b9e73925e9d404e517ba510f73fe472e4e
# Parent c43be17eec0602015fc6461d1f13c992ba330c20
AMD IOMMU: also spot missing IO-APIC entries in IVRS table

Apart from dealing with duplicate conflicting entries, we also have to
handle firmware omitting IO-APIC entries in IVRS altogether. Not doing
so resulted in c/s 26517:601139e2b0db crashing such systems during
boot (whereas with the change here the IOMMU gets disabled, just as is
being done in the other cases, i.e. unless global tables are being
used).

Debugging this issue also pointed out that the debug log output is
pretty ugly to look at - consolidate the output, and add one extra
item for the IVHD special entries, so that future issues are easier
to analyze.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -352,9 +352,8 @@ static int __init parse_ivmd_block(const
base = start_addr & PAGE_MASK;
limit = (start_addr + mem_length - 1) & PAGE_MASK;
- AMD_IOMMU_DEBUG("IVMD Block: Type 0x%x\n",ivmd_block->header.type);
- AMD_IOMMU_DEBUG(" Start_Addr_Phys 0x%lx\n", start_addr);
- AMD_IOMMU_DEBUG(" Mem_Length 0x%lx\n", mem_length);
+ AMD_IOMMU_DEBUG("IVMD Block: type %#x phys %#lx len %#lx\n",
+ ivmd_block->header.type, start_addr, mem_length);
if ( ivmd_block->header.flags & ACPI_IVMD_EXCLUSION_RANGE )
iw = ir = IOMMU_CONTROL_ENABLED;
@@ -549,8 +548,8 @@ static u16 __init parse_ivhd_device_alia
return 0;
}
- AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf);
- AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id);
+ AMD_IOMMU_DEBUG(" Dev_Id Range: %#x -> %#x alias %#x\n",
+ first_bdf, last_bdf, alias_id);
for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
add_ivrs_mapping_entry(bdf, alias_id, range->alias.header.data_setting,
@@ -652,6 +651,9 @@ static u16 __init parse_ivhd_device_spec
return 0;
}
+ AMD_IOMMU_DEBUG("IVHD Special: %04x:%02x:%02x.%u variety %#x handle %#x\n",
+ seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf),
+ special->variety, special->handle);
add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu);
if ( special->variety != ACPI_IVHD_IOAPIC )
@@ -737,10 +739,9 @@ static int __init parse_ivhd_block(const
{
ivhd_device = (const void *)((const u8 *)ivhd_block + block_length);
- AMD_IOMMU_DEBUG( "IVHD Device Entry:\n");
- AMD_IOMMU_DEBUG( " Type 0x%x\n", ivhd_device->header.type);
- AMD_IOMMU_DEBUG( " Dev_Id 0x%x\n", ivhd_device->header.id);
- AMD_IOMMU_DEBUG( " Flags 0x%x\n", ivhd_device->header.data_setting);
+ AMD_IOMMU_DEBUG("IVHD Device Entry: type %#x id %#x flags %#x\n",
+ ivhd_device->header.type, ivhd_device->header.id,
+ ivhd_device->header.data_setting);
switch ( ivhd_device->header.type )
{
@@ -869,6 +870,7 @@ static int __init parse_ivrs_table(struc
{
const struct acpi_ivrs_header *ivrs_block;
unsigned long length;
+ unsigned int apic;
int error = 0;
BUG_ON(!table);
@@ -882,11 +884,9 @@ static int __init parse_ivrs_table(struc
{
ivrs_block = (struct acpi_ivrs_header *)((u8 *)table + length);
- AMD_IOMMU_DEBUG("IVRS Block:\n");
- AMD_IOMMU_DEBUG(" Type 0x%x\n", ivrs_block->type);
- AMD_IOMMU_DEBUG(" Flags 0x%x\n", ivrs_block->flags);
- AMD_IOMMU_DEBUG(" Length 0x%x\n", ivrs_block->length);
- AMD_IOMMU_DEBUG(" Dev_Id 0x%x\n", ivrs_block->device_id);
+ AMD_IOMMU_DEBUG("IVRS Block: type %#x flags %#x len %#x id %#x\n",
+ ivrs_block->type, ivrs_block->flags,
+ ivrs_block->length, ivrs_block->device_id);
if ( table->length < (length + ivrs_block->length) )
{
@@ -901,6 +901,29 @@ static int __init parse_ivrs_table(struc
length += ivrs_block->length;
}
+ /* Each IO-APIC must have been mentioned in the table. */
+ for ( apic = 0; !error && apic < nr_ioapics; ++apic )
+ {
+ if ( !nr_ioapic_entries[apic] ||
+ ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
+ continue;
+
+ printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n",
+ IO_APIC_ID(apic));
+ if ( amd_iommu_perdev_intremap )
+ error = -ENXIO;
+ else
+ {
+ ioapic_sbdf[IO_APIC_ID(apic)].pin_setup = xzalloc_array(
+ unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic]));
+ if ( !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
+ {
+ printk(XENLOG_ERR "IVHD Error: Out of memory\n");
+ error = -ENOMEM;
+ }
+ }
+ }
+
return error;
}

View File

@ -1,39 +0,0 @@
# HG changeset patch
# User Tim Deegan <tim@xen.org>
# Date 1360917722 -3600
# Node ID 0cca8a18432f08b342d76a753aa98559d892f592
# Parent 7af3c38ae187b351c5cea58e9eee482b50d814d8
xenoprof: avoid division by 0
Signed-off-by: Tim Deegan <tim@xen.org>
Acked-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -193,6 +193,13 @@ static int alloc_xenoprof_struct(
unsigned max_max_samples;
int i;
+ nvcpu = 0;
+ for_each_vcpu ( d, v )
+ nvcpu++;
+
+ if ( !nvcpu )
+ return -EINVAL;
+
d->xenoprof = xzalloc(struct xenoprof);
if ( d->xenoprof == NULL )
{
@@ -209,10 +216,6 @@ static int alloc_xenoprof_struct(
return -ENOMEM;
}
- nvcpu = 0;
- for_each_vcpu ( d, v )
- nvcpu++;
-
bufsize = sizeof(struct xenoprof_buf);
i = sizeof(struct event_log);
#ifdef CONFIG_COMPAT

View File

@ -9,10 +9,11 @@ Signed-off-by: Ross Philipson <ross.philipson@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
diff -r 71c15ae09983 -r 3124ab7855fd tools/libxl/libxl_dom.c
--- a/tools/libxl/libxl_dom.c Fri Feb 15 13:32:15 2013 +0000
+++ b/tools/libxl/libxl_dom.c Fri Feb 15 13:32:16 2013 +0000
@@ -542,17 +542,24 @@ int libxl__build_hvm(libxl__gc *gc, uint
Index: xen-4.2.2-testing/tools/libxl/libxl_dom.c
===================================================================
--- xen-4.2.2-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.2.2-testing/tools/libxl/libxl_dom.c
@@ -546,17 +546,24 @@ int libxl__build_hvm(libxl__gc *gc, uint
libxl__domain_build_state *state)
{
libxl_ctx *ctx = libxl__gc_owner(gc);

View File

@ -25,10 +25,10 @@ Signed-off-by: Ross Philipson <ross.philipson@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.2.1-testing/docs/man/xl.cfg.pod.5
Index: xen-4.2.2-testing/docs/man/xl.cfg.pod.5
===================================================================
--- xen-4.2.1-testing.orig/docs/man/xl.cfg.pod.5
+++ xen-4.2.1-testing/docs/man/xl.cfg.pod.5
--- xen-4.2.2-testing.orig/docs/man/xl.cfg.pod.5
+++ xen-4.2.2-testing/docs/man/xl.cfg.pod.5
@@ -637,6 +637,25 @@ of Xen) within a Xen guest or to support
which uses hardware virtualisation extensions (e.g. Windows XP
compatibility mode on more modern Windows OS).
@ -55,10 +55,10 @@ Index: xen-4.2.1-testing/docs/man/xl.cfg.pod.5
=back
=head3 Guest Virtual Time Controls
Index: xen-4.2.1-testing/tools/libxl/libxl.h
Index: xen-4.2.2-testing/tools/libxl/libxl.h
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl.h
+++ xen-4.2.1-testing/tools/libxl/libxl.h
--- xen-4.2.2-testing.orig/tools/libxl/libxl.h
+++ xen-4.2.2-testing/tools/libxl/libxl.h
@@ -68,6 +68,13 @@
*/
@ -73,11 +73,11 @@ Index: xen-4.2.1-testing/tools/libxl/libxl.h
* libxl ABI compatibility
*
* The only guarantee which libxl makes regarding ABI compatibility
Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
Index: xen-4.2.2-testing/tools/libxl/libxl_dom.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.2.1-testing/tools/libxl/libxl_dom.c
@@ -21,6 +21,7 @@
--- xen-4.2.2-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.2.2-testing/tools/libxl/libxl_dom.c
@@ -22,6 +22,7 @@
#include <xc_dom.h>
#include <xen/hvm/hvm_info_table.h>
@ -85,7 +85,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
{
@@ -510,11 +511,61 @@ static int hvm_build_set_params(xc_inter
@@ -514,11 +515,61 @@ static int hvm_build_set_params(xc_inter
return 0;
}
@ -149,7 +149,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
if (info->u.hvm.firmware)
firmware = info->u.hvm.firmware;
@@ -528,13 +579,52 @@ static const char *libxl__domain_firmwar
@@ -532,13 +583,52 @@ static const char *libxl__domain_firmwar
firmware = "hvmloader";
break;
default:
@ -206,7 +206,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
}
int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
@@ -544,10 +634,6 @@ int libxl__build_hvm(libxl__gc *gc, uint
@@ -548,10 +638,6 @@ int libxl__build_hvm(libxl__gc *gc, uint
libxl_ctx *ctx = libxl__gc_owner(gc);
struct xc_hvm_build_args args = {};
int ret, rc = ERROR_FAIL;
@ -217,7 +217,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
memset(&args, 0, sizeof(struct xc_hvm_build_args));
/* The params from the configuration file are in Mb, which are then
@@ -557,22 +643,34 @@ int libxl__build_hvm(libxl__gc *gc, uint
@@ -561,22 +647,34 @@ int libxl__build_hvm(libxl__gc *gc, uint
*/
args.mem_size = (uint64_t)(info->max_memkb - info->video_memkb) << 10;
args.mem_target = (uint64_t)(info->target_memkb - info->video_memkb) << 10;
@ -256,7 +256,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
out:
return rc;
}
@@ -634,7 +732,7 @@ int libxl__toolstack_restore(uint32_t do
@@ -638,7 +736,7 @@ int libxl__toolstack_restore(uint32_t do
memcpy(&count, ptr, sizeof(count));
ptr += sizeof(count);
@ -265,7 +265,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
if (size < sizeof(version) + sizeof(count) +
count * (sizeof(struct libxl__physmap_info))) {
LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "wrong size");
@@ -809,7 +907,7 @@ static void switch_logdirty_xswatch(libx
@@ -852,7 +950,7 @@ static void switch_logdirty_xswatch(libx
rc = libxl__xs_rm_checked(gc, t, lds->ret_path);
if (rc) goto out;
@ -274,7 +274,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
if (!rc) break;
if (rc<0) goto out;
}
@@ -1281,7 +1379,7 @@ void libxl__xc_domain_save_done(libxl__e
@@ -1324,7 +1422,7 @@ void libxl__xc_domain_save_done(libxl__e
if (type == LIBXL_DOMAIN_TYPE_HVM) {
rc = libxl__domain_suspend_device_model(gc, dss);
if (rc) goto out;
@ -283,10 +283,10 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
libxl__domain_save_device_model(egc, dss, domain_suspend_done);
return;
}
Index: xen-4.2.1-testing/tools/libxl/libxl_types.idl
Index: xen-4.2.2-testing/tools/libxl/libxl_types.idl
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_types.idl
+++ xen-4.2.1-testing/tools/libxl/libxl_types.idl
--- xen-4.2.2-testing.orig/tools/libxl/libxl_types.idl
+++ xen-4.2.2-testing/tools/libxl/libxl_types.idl
@@ -301,6 +301,8 @@ libxl_domain_build_info = Struct("domain
("vpt_align", libxl_defbool),
("timer_mode", libxl_timer_mode),
@ -296,10 +296,10 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_types.idl
("nographic", libxl_defbool),
("vga", libxl_vga_interface_info),
("vnc", libxl_vnc_info),
Index: xen-4.2.1-testing/tools/libxl/xl_cmdimpl.c
Index: xen-4.2.2-testing/tools/libxl/xl_cmdimpl.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/xl_cmdimpl.c
+++ xen-4.2.1-testing/tools/libxl/xl_cmdimpl.c
--- xen-4.2.2-testing.orig/tools/libxl/xl_cmdimpl.c
+++ xen-4.2.2-testing/tools/libxl/xl_cmdimpl.c
@@ -863,6 +863,11 @@ static void parse_config_data(const char
}

View File

@ -9,11 +9,11 @@ Signed-off-by: Ross Philipson <ross.philipson@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Committed-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
Index: xen-4.2.2-testing/tools/libxl/libxl_dom.c
===================================================================
--- xen-4.2.1-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.2.1-testing/tools/libxl/libxl_dom.c
@@ -31,8 +31,7 @@ libxl_domain_type libxl__domain_type(lib
--- xen-4.2.2-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.2.2-testing/tools/libxl/libxl_dom.c
@@ -32,8 +32,7 @@ libxl_domain_type libxl__domain_type(lib
ret = xc_domain_getinfolist(ctx->xch, domid, 1, &info);
if (ret != 1 || info.domain != domid) {
@ -23,7 +23,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return LIBXL_DOMAIN_TYPE_INVALID;
}
if (info.flags & XEN_DOMINF_hvm_guest)
@@ -313,20 +312,19 @@ int libxl__build_post(libxl__gc *gc, uin
@@ -317,20 +316,19 @@ int libxl__build_post(libxl__gc *gc, uin
ents = libxl__calloc(gc, 12 + (info->max_vcpus * 2) + 2, sizeof(char *));
ents[0] = "memory/static-max";
@ -51,7 +51,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
ents[12+(i*2)+1] = libxl_bitmap_test(&info->avail_vcpus, i)
? "online" : "offline";
}
@@ -335,7 +333,7 @@ int libxl__build_post(libxl__gc *gc, uin
@@ -339,7 +337,7 @@ int libxl__build_post(libxl__gc *gc, uin
if (info->type == LIBXL_DOMAIN_TYPE_HVM) {
hvm_ents = libxl__calloc(gc, 3, sizeof(char *));
hvm_ents[0] = "hvmloader/generation-id-address";
@ -60,7 +60,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
}
dom_path = libxl__xs_get_dompath(gc, domid);
@@ -343,7 +341,7 @@ int libxl__build_post(libxl__gc *gc, uin
@@ -347,7 +345,7 @@ int libxl__build_post(libxl__gc *gc, uin
return ERROR_FAIL;
}
@ -69,7 +69,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
retry_transaction:
t = xs_transaction_start(ctx->xsh);
@@ -374,7 +372,7 @@ int libxl__build_pv(libxl__gc *gc, uint3
@@ -378,7 +376,7 @@ int libxl__build_pv(libxl__gc *gc, uint3
dom = xc_dom_allocate(ctx->xch, state->pv_cmdline, info->u.pv.features);
if (!dom) {
@ -78,7 +78,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return ERROR_FAIL;
}
@@ -384,13 +382,13 @@ int libxl__build_pv(libxl__gc *gc, uint3
@@ -388,13 +386,13 @@ int libxl__build_pv(libxl__gc *gc, uint3
state->pv_kernel.data,
state->pv_kernel.size);
if ( ret != 0) {
@ -94,7 +94,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
goto out;
}
}
@@ -398,12 +396,12 @@ int libxl__build_pv(libxl__gc *gc, uint3
@@ -402,12 +400,12 @@ int libxl__build_pv(libxl__gc *gc, uint3
if ( state->pv_ramdisk.path && strlen(state->pv_ramdisk.path) ) {
if (state->pv_ramdisk.mapped) {
if ( (ret = xc_dom_ramdisk_mem(dom, state->pv_ramdisk.data, state->pv_ramdisk.size)) != 0 ) {
@ -109,7 +109,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
goto out;
}
}
@@ -416,31 +414,31 @@ int libxl__build_pv(libxl__gc *gc, uint3
@@ -420,31 +418,31 @@ int libxl__build_pv(libxl__gc *gc, uint3
dom->xenstore_domid = state->store_domid;
if ( (ret = xc_dom_boot_xen_init(dom, ctx->xch, domid)) != 0 ) {
@ -148,7 +148,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
goto out;
}
@@ -679,8 +677,7 @@ int libxl__qemu_traditional_cmd(libxl__g
@@ -683,8 +681,7 @@ int libxl__qemu_traditional_cmd(libxl__g
const char *cmd)
{
char *path = NULL;
@ -158,7 +158,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return libxl__xs_write(gc, XBT_NULL, path, "%s", cmd);
}
@@ -697,8 +694,7 @@ struct libxl__physmap_info {
@@ -701,8 +698,7 @@ struct libxl__physmap_info {
static inline char *restore_helper(libxl__gc *gc, uint32_t domid,
uint64_t phys_offset, char *node)
{
@ -168,7 +168,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
domid, phys_offset, node);
}
@@ -708,7 +704,6 @@ int libxl__toolstack_restore(uint32_t do
@@ -712,7 +708,6 @@ int libxl__toolstack_restore(uint32_t do
libxl__save_helper_state *shs = user;
libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
STATE_AO_GC(dcs->ao);
@ -176,7 +176,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
int i, ret;
const uint8_t *ptr = buf;
uint32_t count = 0, version = 0;
@@ -718,7 +713,7 @@ int libxl__toolstack_restore(uint32_t do
@@ -722,7 +717,7 @@ int libxl__toolstack_restore(uint32_t do
LOG(DEBUG,"domain=%"PRIu32" toolstack data size=%"PRIu32, domid, size);
if (size < sizeof(version) + sizeof(count)) {
@ -185,7 +185,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return -1;
}
@@ -726,7 +721,7 @@ int libxl__toolstack_restore(uint32_t do
@@ -730,7 +725,7 @@ int libxl__toolstack_restore(uint32_t do
ptr += sizeof(version);
if (version != TOOLSTACK_SAVE_VERSION) {
@ -194,7 +194,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return -1;
}
@@ -735,7 +730,7 @@ int libxl__toolstack_restore(uint32_t do
@@ -739,7 +734,7 @@ int libxl__toolstack_restore(uint32_t do
if (size < sizeof(version) + sizeof(count) +
count * (sizeof(struct libxl__physmap_info))) {
@ -203,7 +203,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return -1;
}
@@ -945,15 +940,13 @@ static void switch_logdirty_done(libxl__
@@ -988,15 +983,13 @@ static void switch_logdirty_done(libxl__
int libxl__domain_suspend_device_model(libxl__gc *gc,
libxl__domain_suspend_state *dss)
{
@ -220,7 +220,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
libxl__qemu_traditional_cmd(gc, domid, "save");
libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, NULL);
break;
@@ -1129,8 +1122,7 @@ int libxl__domain_suspend_common_callbac
@@ -1172,8 +1165,7 @@ int libxl__domain_suspend_common_callbac
static inline char *physmap_path(libxl__gc *gc, uint32_t domid,
char *phys_offset, char *node)
{
@ -230,7 +230,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
domid, phys_offset, node);
}
@@ -1147,7 +1139,7 @@ int libxl__toolstack_save(uint32_t domid
@@ -1190,7 +1182,7 @@ int libxl__toolstack_save(uint32_t domid
char **entries = NULL;
struct libxl__physmap_info *pi;
@ -239,7 +239,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
"/local/domain/0/device-model/%d/physmap", domid), &num);
count = num;
@@ -1288,7 +1280,7 @@ void libxl__domain_suspend(libxl__egc *e
@@ -1331,7 +1323,7 @@ void libxl__domain_suspend(libxl__egc *e
char *path;
char *addr;
@ -248,7 +248,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
libxl__xs_get_dompath(gc, domid));
addr = libxl__xs_read(gc, XBT_NULL, path);
@@ -1502,10 +1494,7 @@ static void domain_suspend_done(libxl__e
@@ -1545,10 +1537,7 @@ static void domain_suspend_done(libxl__e
char *libxl__uuid2string(libxl__gc *gc, const libxl_uuid uuid)
{
@ -260,7 +260,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
}
static const char *userdata_path(libxl__gc *gc, uint32_t domid,
@@ -1513,34 +1502,27 @@ static const char *userdata_path(libxl__
@@ -1556,34 +1545,27 @@ static const char *userdata_path(libxl__
const char *wh)
{
libxl_ctx *ctx = libxl__gc_owner(gc);
@ -301,7 +301,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
return errno;
}
return 0;
@@ -1548,7 +1530,6 @@ static int userdata_delete(libxl__gc *gc
@@ -1591,7 +1573,6 @@ static int userdata_delete(libxl__gc *gc
void libxl__userdata_destroyall(libxl__gc *gc, uint32_t domid)
{
@ -309,7 +309,7 @@ Index: xen-4.2.1-testing/tools/libxl/libxl_dom.c
const char *pattern;
glob_t gl;
int r, i;
@@ -1564,7 +1545,7 @@ void libxl__userdata_destroyall(libxl__g
@@ -1607,7 +1588,7 @@ void libxl__userdata_destroyall(libxl__g
if (r == GLOB_NOMATCH)
goto out;
if (r)

View File

@ -18,7 +18,7 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -1198,6 +1198,9 @@ static int lapic_load_regs(struct domain
@@ -1194,6 +1194,9 @@ static int lapic_load_regs(struct domain
if ( hvm_load_entry(LAPIC_REGS, h, s->regs) != 0 )
return -EINVAL;
@ -43,7 +43,7 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
if (v->arch.hvm_vmx.eoi_exitmap_changed) {
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1520,6 +1520,29 @@ static int vmx_virtual_intr_delivery_ena
@@ -1523,6 +1523,29 @@ static int vmx_virtual_intr_delivery_ena
return cpu_has_vmx_virtual_intr_delivery;
}
@ -73,7 +73,7 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
static struct hvm_function_table __read_mostly vmx_function_table = {
.name = "VMX",
.cpu_up_prepare = vmx_cpu_up_prepare,
@@ -1568,7 +1591,8 @@ static struct hvm_function_table __read_
@@ -1571,7 +1594,8 @@ static struct hvm_function_table __read_
.nhvm_intr_blocked = nvmx_intr_blocked,
.nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
.update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap,

View File

@ -18,8 +18,10 @@ Acked-by: Eddie Dong <eddie.dong@intel.com>
Acked-by: Jun Nakajima <jun.nakajima@intel.com>
Committed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vmcs.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmcs.c
@@ -194,7 +194,8 @@ static int vmx_init_vmcs_config(void)
*/
if ( _vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW )
@ -128,9 +130,11 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
}
/* I/O access bitmap. */
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2009,18 +2009,63 @@ static void vmx_install_vlapic_mapping(s
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vmx/vmx.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vmx/vmx.c
@@ -2012,18 +2012,63 @@ static void vmx_install_vlapic_mapping(s
void vmx_vlapic_msr_changed(struct vcpu *v)
{
@ -198,8 +202,10 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
vmx_update_secondary_exec_control(v);
vmx_vmcs_exit(v);
}
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
Index: xen-4.2.2-testing/xen/include/asm-x86/hvm/vmx/vmcs.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ xen-4.2.2-testing/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -182,6 +182,7 @@ extern u32 vmx_vmentry_control;
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
@ -225,8 +231,10 @@ Committed-by: Jan Beulich <jbeulich@suse.com>
int vmx_read_guest_msr(u32 msr, u64 *val);
int vmx_write_guest_msr(u32 msr, u64 val);
int vmx_add_guest_msr(u32 msr);
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
Index: xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/msr-index.h
+++ xen-4.2.2-testing/xen/include/asm-x86/msr-index.h
@@ -295,7 +295,10 @@
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
#define MSR_IA32_APICBASE_MSR 0x800

View File

@ -1,25 +0,0 @@
# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1361176655 -3600
# Node ID 57e67af5281a6b66cf71dfa812e4335930684fd6
# Parent 45d59b822ed187c535b127679e32853b148ed411
AMD IOMMU: don't BUG() when we don't have to
find_iommu_for_device() can easily return NULL instead, as all of its
callers are prepared for that.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -32,8 +32,8 @@ struct amd_iommu *find_iommu_for_device(
{
struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg);
- BUG_ON ( bdf >= ivrs_bdf_entries );
- return ivrs_mappings ? ivrs_mappings[bdf].iommu : NULL;
+ return ivrs_mappings && bdf < ivrs_bdf_entries ? ivrs_mappings[bdf].iommu
+ : NULL;
}
/*

View File

@ -1,50 +0,0 @@
# Commit a15d87475ed95840dba693ab0a56d0b48a215cbc
# Date 2013-02-21 15:16:20 +0000
# Author Tim Deegan <tim@xen.org>
# Committer Tim Deegan <tim@xen.org>
x86/mm: Take the p2m lock even in shadow mode.

The reworking of p2m lookups to use get_gfn()/put_gfn() left the
shadow code not taking the p2m lock, even in cases where the p2m would
be updated (i.e. PoD).

In many cases, shadow code doesn't need the exclusion that
get_gfn()/put_gfn() provides, as it has its own interlocks against p2m
updates, but this is taking things too far, and can lead to crashes in
the PoD code.

Now that most shadow-code p2m lookups are done with explicitly
unlocked accessors, or with the get_page_from_gfn() accessor, which is
often lock-free, we can just turn this locking on.

The remaining locked lookups are in sh_page_fault() (in a path that's
almost always already serializing on the paging lock), and in
emulate_map_dest() (which can probably be updated to use
get_page_from_gfn()). They're not addressed here but may be in a
follow-up patch.

Signed-off-by: Tim Deegan <tim@xen.org>
Acked-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -163,8 +163,7 @@ mfn_t __get_gfn_type_access(struct p2m_d
return _mfn(gfn);
}
- /* For now only perform locking on hap domains */
- if ( locked && (hap_enabled(p2m->domain)) )
+ if ( locked )
/* Grab the lock here, don't release until put_gfn */
gfn_lock(p2m, gfn, 0);
@@ -197,8 +196,7 @@ mfn_t __get_gfn_type_access(struct p2m_d
void __put_gfn(struct p2m_domain *p2m, unsigned long gfn)
{
- if ( !p2m || !paging_mode_translate(p2m->domain)
- || !hap_enabled(p2m->domain) )
+ if ( !p2m || !paging_mode_translate(p2m->domain) )
/* Nothing to do in this case */
return;

View File

@ -1,57 +0,0 @@
# Commit 17281aea1a9a10f1ee165c6e6a2921a67b7b1df2
# Date 2013-02-22 11:21:38 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/nhvm: properly clean up after failure to set up all vCPU-s

Otherwise we may leak memory when setting up nHVM fails half way.

This implies that the individual destroy functions will have to remain
capable (in the VMX case they first need to be made so, following
26486:7648ef657fe7 and 26489:83a3fa9c8434) of being called for a vCPU
that the corresponding init function was never run on.

Once at it, also remove a redundant check from the corresponding
parameter validation code.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
Tested-by: Olaf Hering <olaf@aepfle.de>
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3941,18 +3941,20 @@ long do_hvm_op(unsigned long op, XEN_GUE
#else
if ( a.value > 1 )
rc = -EINVAL;
- if ( !is_hvm_domain(d) )
- rc = -EINVAL;
/* Remove the check below once we have
* shadow-on-shadow.
*/
if ( cpu_has_svm && !paging_mode_hap(d) && a.value )
rc = -EINVAL;
/* Set up NHVM state for any vcpus that are already up */
- if ( !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
+ if ( a.value &&
+ !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
for_each_vcpu(d, v)
if ( rc == 0 )
rc = nestedhvm_vcpu_initialise(v);
+ if ( !a.value || rc )
+ for_each_vcpu(d, v)
+ nestedhvm_vcpu_destroy(v);
#endif
break;
case HVM_PARAM_BUFIOREQ_EVTCHN:
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -88,7 +88,7 @@ nestedhvm_vcpu_initialise(struct vcpu *v
void
nestedhvm_vcpu_destroy(struct vcpu *v)
{
- if ( nestedhvm_enabled(v->domain) && hvm_funcs.nhvm_vcpu_destroy )
+ if ( hvm_funcs.nhvm_vcpu_destroy )
hvm_funcs.nhvm_vcpu_destroy(v);
}

View File

@ -1,158 +0,0 @@
# Commit 992fdf6f46252a459c6b1b8d971b2c71f01460f8
# Date 2013-02-22 11:56:54 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
honor ACPI v4 FADT flags

- force use of physical APIC mode if indicated so (as we don't support
  xAPIC cluster mode, the respective flag is taken to force physical
  mode too)
- don't use MSI if indicated so (implies no IOMMU)

Both can be overridden on the command line; for the MSI case this at
once adds a new command line option allowing PCI MSI to be turned off
(IOMMU and HPET are unaffected by this).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
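
For reference, both overrides are plain booleans on the hypervisor
command line; illustrative legacy-GRUB entries (paths hypothetical):

    kernel /boot/xen.gz msi=0          # never use PCI-MSI
    kernel /boot/xen.gz msi=1          # use PCI-MSI even if the FADT says not to
    kernel /boot/xen.gz x2apic_phys=0  # keep logical cluster mode despite the FADT
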
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -602,6 +602,13 @@ limit is ignored by Xen.
Specify if the MMConfig space should be enabled.
+### msi
+> `= <boolean>`
+
+> Default: `true`
+
+Force Xen to (not) use PCI-MSI, even if ACPI FADT says otherwise.
+
### nmi
> `= ignore | dom0 | fatal`
--- a/xen/arch/x86/genapic/bigsmp.c
+++ b/xen/arch/x86/genapic/bigsmp.c
@@ -40,7 +40,14 @@ static struct dmi_system_id __initdata b
static __init int probe_bigsmp(void)
{
- if (!def_to_bigsmp)
+ /*
+ * We don't implement cluster mode, so force use of
+ * physical mode in both cases.
+ */
+ if (acpi_gbl_FADT.flags &
+ (ACPI_FADT_APIC_CLUSTER | ACPI_FADT_APIC_PHYSICAL))
+ def_to_bigsmp = 1;
+ else if (!def_to_bigsmp)
dmi_check_system(bigsmp_dmi_table);
return def_to_bigsmp;
}
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -29,9 +29,6 @@
#include <xen/smp.h>
#include <asm/mach-default/mach_mpparse.h>
-static bool_t __initdata x2apic_phys; /* By default we use logical cluster mode. */
-boolean_param("x2apic_phys", x2apic_phys);
-
static void init_apic_ldr_x2apic_phys(void)
{
}
@@ -121,8 +118,14 @@ static const struct genapic apic_x2apic_
.send_IPI_self = send_IPI_self_x2apic
};
+static s8 __initdata x2apic_phys = -1; /* By default we use logical cluster mode. */
+boolean_param("x2apic_phys", x2apic_phys);
+
const struct genapic *__init apic_x2apic_probe(void)
{
+ if ( x2apic_phys < 0 )
+ x2apic_phys = !!(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL);
+
return x2apic_phys ? &apic_x2apic_phys : &apic_x2apic_cluster;
}
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -31,6 +31,9 @@
#include <xen/iommu.h>
#include <xsm/xsm.h>
+static s8 __read_mostly use_msi = -1;
+boolean_param("msi", use_msi);
+
/* bitmap indicate which fixed map is free */
DEFINE_SPINLOCK(msix_fixmap_lock);
DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
@@ -958,6 +961,9 @@ int pci_enable_msi(struct msi_info *msi,
{
ASSERT(spin_is_locked(&pcidevs_lock));
+ if ( !use_msi )
+ return -EPERM;
+
return msi->table_base ? __pci_enable_msix(msi, desc) :
__pci_enable_msi(msi, desc);
}
@@ -1003,7 +1009,10 @@ int pci_restore_msi_state(struct pci_dev
ASSERT(spin_is_locked(&pcidevs_lock));
- if (!pdev)
+ if ( !use_msi )
+ return -EOPNOTSUPP;
+
+ if ( !pdev )
return -EINVAL;
ret = xsm_resource_setup_pci((pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
@@ -1062,7 +1071,7 @@ unsigned int pci_msix_get_table_len(stru
func = PCI_FUNC(pdev->devfn);
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
- if ( !pos )
+ if ( !pos || !use_msi )
return 0;
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
@@ -1135,6 +1144,9 @@ static struct keyhandler dump_msi_keyhan
static int __init msi_setup_keyhandler(void)
{
+ if ( use_msi < 0 )
+ use_msi = !(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI);
+
register_keyhandler('M', &dump_msi_keyhandler);
return 0;
}
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -1066,5 +1066,8 @@ int __init amd_iommu_get_ivrs_dev_entrie
int __init amd_iommu_update_ivrs_mapping_acpi(void)
{
+ if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
+ return -EPERM;
+
return acpi_table_parse(ACPI_SIG_IVRS, parse_ivrs_table);
}
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2119,6 +2119,12 @@ int __init intel_vtd_setup(void)
if ( list_empty(&acpi_drhd_units) )
return -ENODEV;
+ if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
+ {
+ ret = -EPERM;
+ goto error;
+ }
+
platform_quirks_init();
/* We enable the following features only if they are supported by all VT-d

View File

@ -1,22 +0,0 @@
# Commit c40e24a8ef74f9d0ee59dd9b8ca890be08b0b874
# Date 2013-02-25 12:44:25 +0100
# Author Xi Wang <xi@mit.edu>
# Committer Jan Beulich <jbeulich@suse.com>
x86: fix null pointer dereference in intel_get_extended_msrs()
`memset(&mc_ext, 0, ...)' leads to a buffer overflow and a subsequent
null pointer dereference. Replace `&mc_ext' with `mc_ext'.
Signed-off-by: Xi Wang <xi@mit.edu>
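
The bug pattern in isolation (a standalone sketch, not Xen code; the
struct is a stand-in for the real type):

    #include <string.h>

    struct mcinfo_extended { int data[8]; };

    /* Before the fix: zeroes sizeof(struct) bytes starting at the address
     * of the local pointer variable itself (overrunning its stack slot);
     * mc_ext then reads back as NULL/garbage and the dereference crashes. */
    static void fill_broken(struct mcinfo_extended *mc_ext)
    {
        memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
        mc_ext->data[0] = 1;
    }

    /* After the fix: zeroes the object mc_ext points to. */
    static void fill_fixed(struct mcinfo_extended *mc_ext)
    {
        memset(mc_ext, 0, sizeof(*mc_ext));
        mc_ext->data[0] = 1;
    }
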
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -534,7 +534,7 @@ intel_get_extended_msrs(struct mcinfo_gl
}
/* this function will called when CAP(9).MCG_EXT_P = 1 */
- memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+ memset(mc_ext, 0, sizeof(*mc_ext));
mc_ext->common.type = MC_TYPE_EXTENDED;
mc_ext->common.size = sizeof(struct mcinfo_extended);

View File

@ -1,73 +0,0 @@
# Commit 0f8adcb2a7183bea5063f6fffba7d7e1aa14fc84
# Date 2013-02-26 10:14:53 +0100
# Author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
IOMMU, AMD Family15h Model10-1Fh erratum 746 Workaround
The IOMMU may stop processing page translations due to a perceived lack
of credits for writing upstream peripheral page service request (PPR)
or event logs. If the L2B miscellaneous clock gating feature is enabled
the IOMMU does not properly register credits after the log request has
completed, leading to a potential system hang.
BIOSes are supposed to disable L2B miscellaneous clock gating by setting
L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b. This
patch corrects that for those which do not enable this workaround.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
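
The workaround below uses AMD's index/data register pair in PCI config
space; distilled into a sketch (same logic as the diff, using Xen's
pci_conf_* accessors):

    /* D0F2xF0 is the index port, D0F2xF4 the data port; bit 8 of the
     * index value enables writing the selected NB register. */
    static void disable_l2b_misc_ck_gating(u16 seg, u8 bus, u8 dev, u8 func)
    {
        u32 value;

        pci_conf_write32(seg, bus, dev, func, 0xf0, 0x90);      /* select 0x90 */
        value = pci_conf_read32(seg, bus, dev, func, 0xf4);
        if ( value & (1 << 2) )
            return;                             /* BIOS already disabled it */
        pci_conf_write32(seg, bus, dev, func, 0xf0, 0x90 | (1 << 8));
        pci_conf_write32(seg, bus, dev, func, 0xf4, value | (1 << 2));
        pci_conf_write32(seg, bus, dev, func, 0xf0, 0x90);      /* lock again */
    }
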
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -795,6 +795,42 @@ static int __init set_iommu_interrupt_ha
return irq;
}
+/*
+ * Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations)
+ * Workaround:
+ * BIOS should disable L2B micellaneous clock gating by setting
+ * L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b
+ */
+static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu)
+{
+ u32 value;
+ u8 bus = PCI_BUS(iommu->bdf);
+ u8 dev = PCI_SLOT(iommu->bdf);
+ u8 func = PCI_FUNC(iommu->bdf);
+
+ if ( (boot_cpu_data.x86 != 0x15) ||
+ (boot_cpu_data.x86_model < 0x10) ||
+ (boot_cpu_data.x86_model > 0x1f) )
+ return;
+
+ pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90);
+ value = pci_conf_read32(iommu->seg, bus, dev, func, 0xf4);
+
+ if ( value & (1 << 2) )
+ return;
+
+ /* Select NB indirect register 0x90 and enable writing */
+ pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90 | (1 << 8));
+
+ pci_conf_write32(iommu->seg, bus, dev, func, 0xf4, value | (1 << 2));
+ printk(XENLOG_INFO
+ "AMD-Vi: Applying erratum 746 workaround for IOMMU at %04x:%02x:%02x.%u\n",
+ iommu->seg, bus, dev, func);
+
+ /* Clear the enable writing bit */
+ pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90);
+}
+
static void enable_iommu(struct amd_iommu *iommu)
{
unsigned long flags;
@@ -807,6 +843,8 @@ static void enable_iommu(struct amd_iomm
return;
}
+ amd_iommu_erratum_746_workaround(iommu);
+
register_iommu_dev_table_in_mmio_space(iommu);
register_iommu_cmd_buffer_in_mmio_space(iommu);
register_iommu_event_log_in_mmio_space(iommu);

View File

@ -1,128 +0,0 @@
# Commit 2f8c55ccefe49bb526df0eaf5fa9b7b788422208
# Date 2013-02-26 10:15:56 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: fix CMCI injection

This fixes the wrong use of literal vector 0xF7 with an "int"
instruction (invalidated by 25113:14609be41f36) and the fact that doing
the injection via a software interrupt was never valid anyway (because
cmci_interrupt() acks the LAPIC, which does the wrong thing if the
interrupt didn't get delivered through it).

In order to do the latter, the patch introduces send_IPI_self(), at once
removing two open-coded uses of "genapic" in the IRQ handling code.

Reported-by: Yongjie Ren <yongjie.ren@intel.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Yongjie Ren <yongjie.ren@intel.com>
Acked-by: Keir Fraser <keir@xen.org>
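
Schematically, the change (fragments assuming the symbols from the
diff below):

    /* Pre-patch: a software "int" enters cmci_interrupt() without any
     * LAPIC delivery, so the handler's APIC ack is bogus - and the
     * literal 0xf7 need not match the dynamically allocated vector. */
    static void cmci_inject_broken(void)
    {
        asm volatile ("int $0xf7");
    }

    /* Post-patch: a genuine self-IPI through the local APIC, using the
     * vector the MCE code actually registered. */
    static void cmci_inject_fixed(void)
    {
        send_IPI_self(cmci_apic_vector);
    }
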
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -30,6 +30,7 @@ bool_t __read_mostly mce_broadcast = 0;
bool_t is_mc_panic;
unsigned int __read_mostly nr_mce_banks;
unsigned int __read_mostly firstbank;
+uint8_t __read_mostly cmci_apic_vector;
static void intpose_init(void);
static void mcinfo_clear(struct mc_info *);
@@ -1277,12 +1278,6 @@ static void x86_mc_mceinject(void *data)
__asm__ __volatile__("int $0x12");
}
-static void x86_cmci_inject(void *data)
-{
- printk("Simulating CMCI on cpu %d\n", smp_processor_id());
- __asm__ __volatile__("int $0xf7");
-}
-
#if BITS_PER_LONG == 64
#define ID2COOKIE(id) ((mctelem_cookie_t)(id))
@@ -1568,11 +1563,15 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u
on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
break;
case XEN_MC_INJECT_TYPE_CMCI:
- if ( !cmci_support )
+ if ( !cmci_apic_vector )
ret = x86_mcerr(
"No CMCI supported in platform\n", -EINVAL);
else
- on_selected_cpus(cpumap, x86_cmci_inject, NULL, 1);
+ {
+ if ( cpumask_test_cpu(smp_processor_id(), cpumap) )
+ send_IPI_self(cmci_apic_vector);
+ send_IPI_mask(cpumap, cmci_apic_vector);
+ }
break;
default:
ret = x86_mcerr("Wrong mca type\n", -EINVAL);
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -38,6 +38,8 @@ enum mcheck_type {
mcheck_intel
};
+extern uint8_t cmci_apic_vector;
+
/* Init functions */
enum mcheck_type amd_k7_mcheck_init(struct cpuinfo_x86 *c);
enum mcheck_type amd_k8_mcheck_init(struct cpuinfo_x86 *c);
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -1164,7 +1164,6 @@ static void intel_init_cmci(struct cpuin
{
u32 l, apic;
int cpu = smp_processor_id();
- static uint8_t cmci_apic_vector;
if (!mce_available(c) || !cmci_support) {
if (opt_cpu_info)
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -646,7 +646,7 @@ void irq_move_cleanup_interrupt(struct c
* to myself.
*/
if (irr & (1 << (vector % 32))) {
- genapic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
TRACE_3D(TRC_HW_IRQ_MOVE_CLEANUP_DELAY,
irq, vector, smp_processor_id());
goto unlock;
@@ -692,7 +692,7 @@ static void send_cleanup_vector(struct i
cpumask_and(&cleanup_mask, desc->arch.old_cpu_mask, &cpu_online_map);
desc->arch.move_cleanup_count = cpumask_weight(&cleanup_mask);
- genapic->send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
desc->arch.move_in_progress = 0;
}
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -43,6 +43,11 @@ void send_IPI_mask(const cpumask_t *mask
genapic->send_IPI_mask(mask, vector);
}
+void send_IPI_self(int vector)
+{
+ genapic->send_IPI_self(vector);
+}
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -29,7 +29,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_core_
void smp_send_nmi_allbutself(void);
-void send_IPI_mask(const cpumask_t *mask, int vector);
+void send_IPI_mask(const cpumask_t *, int vector);
+void send_IPI_self(int vector);
extern void (*mtrr_hook) (void);

@@ -1,107 +0,0 @@
# Commit 7dd3b06ff031c9a8c727df16c5def2afb382101c
# Date 2013-02-28 14:00:18 +0000
# Author Tim Deegan <tim@xen.org>
# Committer Tim Deegan <tim@xen.org>
vmx: fix handling of NMI VMEXIT.
Call do_nmi() directly and explicitly re-enable NMIs rather than
raising an NMI through the APIC. Since NMIs are disabled after the
VMEXIT, the raised NMI would be blocked until the next IRET
instruction (i.e. the next real interrupt, or after scheduling a PV
guest) and in the meantime the guest will spin taking NMI VMEXITS.
Also, handle NMIs before re-enabling interrupts, since if we handle an
interrupt (and therefore IRET) before calling do_nmi(), we may end up
running the NMI handler with NMIs enabled.
Signed-off-by: Tim Deegan <tim@xen.org>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
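Condensed from the hunk below, a hedged sketch of the ordering
constraint (local_irq_enable() stands in for wherever the exit path
re-enables interrupts later; it is not part of this hunk):

    if ( vector == TRAP_nmi &&
         (intr_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI << 8) )
    {
        do_nmi(regs);    /* runs while NMIs are still blocked post-VMEXIT */
        enable_nmis();   /* dummy IRET drops the hardware NMI latch */
    }
    local_irq_enable(); /* only now: servicing an interrupt first would
                         * IRET and unmask NMIs before do_nmi() ran */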
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2421,6 +2421,13 @@ void vmx_vmexit_handler(struct cpu_user_
vector = intr_info & INTR_INFO_VECTOR_MASK;
if ( vector == TRAP_machine_check )
do_machine_check(regs);
+ if ( vector == TRAP_nmi
+ && ((intr_info & INTR_INFO_INTR_TYPE_MASK) ==
+ (X86_EVENTTYPE_NMI << 8)) )
+ {
+ do_nmi(regs);
+ enable_nmis();
+ }
break;
case EXIT_REASON_MCE_DURING_VMENTRY:
do_machine_check(regs);
@@ -2594,7 +2601,7 @@ void vmx_vmexit_handler(struct cpu_user_
(X86_EVENTTYPE_NMI << 8) )
goto exit_and_crash;
HVMTRACE_0D(NMI);
- self_nmi(); /* Real NMI, vector 2: normal processing. */
+ /* Already handled above. */
break;
case TRAP_machine_check:
HVMTRACE_0D(MCE);
--- a/xen/arch/x86/x86_32/entry.S
+++ b/xen/arch/x86/x86_32/entry.S
@@ -621,6 +621,14 @@ ENTRY(machine_check)
pushl $TRAP_machine_check<<16
jmp handle_nmi_mce
+/* Enable NMIs. No special register assumptions. All registers are preserved. */
+ENTRY(enable_nmis)
+ /* Set up stack frame */
+ pushf # EFLAGS
+ push %cs # CS
+ push $.Lret # EIP
+ iret # Disable the hardware NMI latch
+
ENTRY(setup_vm86_frame)
mov %ecx,%ds
mov %ecx,%es
@@ -634,7 +642,7 @@ ENTRY(setup_vm86_frame)
.endm
copy_vm86_words
addl $16,%esp
- ret
+.Lret: ret
.section .rodata, "a", @progbits
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -643,6 +643,22 @@ ENTRY(machine_check)
movl $TRAP_machine_check,4(%rsp)
jmp handle_ist_exception
+/* Enable NMIs. No special register assumptions. Only %rax is not preserved. */
+ENTRY(enable_nmis)
+ movq %rsp, %rax /* Grab RSP before pushing */
+
+ /* Set up stack frame */
+ pushq $0 /* SS */
+ pushq %rax /* RSP */
+ pushfq /* RFLAGS */
+ pushq $__HYPERVISOR_CS /* CS */
+ leaq 1f(%rip),%rax
+ pushq %rax /* RIP */
+
+ iretq /* Disable the hardware NMI latch */
+1:
+ retq
+
.section .rodata, "a", @progbits
ENTRY(exception_table)
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -584,6 +584,8 @@ DECLARE_TRAP_HANDLER(alignment_check);
DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
#undef DECLARE_TRAP_HANDLER
+void enable_nmis(void);
+
void syscall_enter(void);
void sysenter_entry(void);
void sysenter_eflags_saved(void);

@@ -1,80 +0,0 @@
# Commit 482300def7d08e773ccd2a0d978bcb9469fdd810
# Date 2013-02-28 14:56:45 +0000
# Author Juergen Gross <juergen.gross@ts.fujitsu.com>
# Committer Keir Fraser <keir@xen.org>
Avoid stale pointer when moving domain to another cpupool
When a domain is moved to another cpupool the scheduler private data pointers
in vcpu and domain structures must never point to an already freed memory
area.
While at it, simplify sched_init_vcpu() by using DOM2OP instead of VCPU2OP.
Signed-off-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
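The resulting ordering, condensed from the hunks below into a hedged
sketch (error handling and timer migration elided; vcpu_priv[] holds the
pre-allocated per-vCPU data for the new pool):

    old_ops     = DOM2OP(d);                 /* snapshot before switching */
    old_domdata = d->sched_priv;

    for_each_vcpu ( d, v )
        SCHED_OP(old_ops, remove_vcpu, v);   /* old data still valid here */

    d->sched_priv = domdata;                 /* switch domain to new pool */
    for_each_vcpu ( d, v )
    {
        vcpudata = v->sched_priv;
        v->sched_priv = vcpu_priv[v->vcpu_id];
        SCHED_OP(c->sched, insert_vcpu, v);
        SCHED_OP(old_ops, free_vdata, vcpudata); /* freed only after the
                                                  * new data is in place */
    }
    SCHED_OP(old_ops, free_domdata, old_domdata); /* last of all */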
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -220,7 +220,7 @@ int sched_init_vcpu(struct vcpu *v, unsi
if ( v->sched_priv == NULL )
return 1;
- SCHED_OP(VCPU2OP(v), insert_vcpu, v);
+ SCHED_OP(DOM2OP(d), insert_vcpu, v);
return 0;
}
@@ -231,6 +231,9 @@ int sched_move_domain(struct domain *d,
unsigned int new_p;
void **vcpu_priv;
void *domdata;
+ void *vcpudata;
+ struct scheduler *old_ops;
+ void *old_domdata;
domdata = SCHED_OP(c->sched, alloc_domdata, d);
if ( domdata == NULL )
@@ -261,21 +264,22 @@ int sched_move_domain(struct domain *d,
domain_pause(d);
+ old_ops = DOM2OP(d);
+ old_domdata = d->sched_priv;
+
for_each_vcpu ( d, v )
{
- SCHED_OP(VCPU2OP(v), remove_vcpu, v);
- SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
- v->sched_priv = NULL;
+ SCHED_OP(old_ops, remove_vcpu, v);
}
- SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
-
d->cpupool = c;
d->sched_priv = domdata;
new_p = cpumask_first(c->cpu_valid);
for_each_vcpu ( d, v )
{
+ vcpudata = v->sched_priv;
+
migrate_timer(&v->periodic_timer, new_p);
migrate_timer(&v->singleshot_timer, new_p);
migrate_timer(&v->poll_timer, new_p);
@@ -288,12 +292,16 @@ int sched_move_domain(struct domain *d,
new_p = cpumask_cycle(new_p, c->cpu_valid);
SCHED_OP(c->sched, insert_vcpu, v);
+
+ SCHED_OP(old_ops, free_vdata, vcpudata);
}
domain_update_node_affinity(d);
domain_unpause(d);
+ SCHED_OP(old_ops, free_domdata, old_domdata);
+
xfree(vcpu_priv);
return 0;

@@ -1,24 +0,0 @@
# Commit 53decd322157e922cac2988e07da6d39538c8033
# Date 2013-03-01 16:59:49 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
fix compat memory exchange op splitting
A shift with a negative count was erroneously used here, yielding
undefined behavior.
Reported-by: Xi Wang <xi@mit.edu>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
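A minimal illustration of the fix: in C, a shift count that is negative
(or at least the operand's width) is undefined behavior, not an implicit
shift in the other direction, so the negative case must negate the count
explicitly:

    /* order_delta < 0 here, so "<< order_delta" is UB; negate it: */
    nat.xchg->out.nr_extents = (order_delta >= 0)
                               ? end_extent >> order_delta
                               : end_extent << -order_delta;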
--- a/xen/common/compat/memory.c
+++ b/xen/common/compat/memory.c
@@ -172,7 +172,7 @@ int compat_memory_op(unsigned int cmd, X
if ( order_delta >= 0 )
nat.xchg->out.nr_extents = end_extent >> order_delta;
else
- nat.xchg->out.nr_extents = end_extent << order_delta;
+ nat.xchg->out.nr_extents = end_extent << -order_delta;
++split;
}

@@ -1,78 +0,0 @@
# Commit 7ffc9779aa5120c5098d938cb88f69a1dda9a0fe
# Date 2013-03-04 10:16:04 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: make certain memory sub-ops return valid values
When a domain's shared info field "max_pfn" is zero,
domain_get_maximum_gpfn() so far returned ULONG_MAX, which
do_memory_op() in turn converted to -1 (i.e. -EPERM). Make the former
always return a sensible number (i.e. zero if the field was zero) and
have the latter no longer truncate return values.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
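The `?:` in the first hunk below is GCC's elvis-operator extension:
`x ?: 1` yields `x` if non-zero and `1` otherwise, without evaluating
`x` twice. A hedged worked example of the boundary case being fixed:

    /* arch_get_max_pfn(d) == 0:  (0 ?: 1) - 1 == 0
     *                            (was 0 - 1 == ULONG_MAX, which the
     *                             caller truncated to -1, i.e. -EPERM)
     * arch_get_max_pfn(d) == n:  (n ?: 1) - 1 == n - 1  (unchanged) */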
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -437,7 +437,7 @@ unsigned long domain_get_maximum_gpfn(st
if ( is_hvm_domain(d) )
return p2m_get_hostp2m(d)->max_mapped_pfn;
/* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
- return arch_get_max_pfn(d) - 1;
+ return (arch_get_max_pfn(d) ?: 1) - 1;
}
void share_xen_page_with_guest(
--- a/xen/common/compat/memory.c
+++ b/xen/common/compat/memory.c
@@ -15,7 +15,8 @@ CHECK_TYPE(domid);
int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE(void) compat)
{
- int rc, split, op = cmd & MEMOP_CMD_MASK;
+ int split, op = cmd & MEMOP_CMD_MASK;
+ long rc;
unsigned int start_extent = cmd >> MEMOP_EXTENT_SHIFT;
do
@@ -204,7 +205,7 @@ int compat_memory_op(unsigned int cmd, X
rc = do_memory_op(cmd, nat.hnd);
if ( rc < 0 )
- return rc;
+ break;
cmd = 0;
if ( hypercall_xlat_continuation(&cmd, 0x02, nat.hnd, compat) )
@@ -318,5 +319,11 @@ int compat_memory_op(unsigned int cmd, X
__HYPERVISOR_memory_op, "ih", cmd, compat);
} while ( split > 0 );
+ if ( unlikely(rc > INT_MAX) )
+ return INT_MAX;
+
+ if ( unlikely(rc < INT_MIN) )
+ return INT_MIN;
+
return rc;
}
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -532,14 +532,13 @@ static long memory_exchange(XEN_GUEST_HA
long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
{
struct domain *d;
- int rc, op;
+ long rc;
unsigned int address_bits;
unsigned long start_extent;
struct xen_memory_reservation reservation;
struct memop_args args;
domid_t domid;
-
- op = cmd & MEMOP_CMD_MASK;
+ int op = cmd & MEMOP_CMD_MASK;
switch ( op )
{

@@ -1,58 +0,0 @@
# Commit e6a6fd63652814e5c36a0016c082032f798ced1f
# Date 2013-03-04 10:17:52 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
SEDF: avoid gathering vCPU-s on pCPU0
The introduction of vcpu_force_reschedule() in 14320:215b799fa181 was
incompatible with the SEDF scheduler: Any vCPU using
VCPUOP_stop_periodic_timer (e.g. any vCPU of halfway modern PV Linux
guests) ends up on pCPU0 after that call. Obviously, running all PV
guests' (and namely Dom0's) vCPU-s on pCPU0 causes problems for those
guests rather sooner than later.
So the main thing that was clearly wrong (and bogus from the beginning)
was the use of cpumask_first() in sedf_pick_cpu(). It is being replaced
by a construct that prefers to put back the vCPU on the pCPU that it
got launched on.
However, there's one more glitch: When reducing the affinity of a vCPU
temporarily, and then widening it again to a set that includes the pCPU
that the vCPU was last running on, the generic scheduler code would not
force a migration of that vCPU, and hence it would forever stay on the
pCPU it last ran on. Since that can again create a load imbalance, the
SEDF scheduler wants a migration to happen regardless of it being
apparently unnecessary.
Of course, an alternative to checking for SEDF explicitly in
vcpu_set_affinity() would be to introduce a flags field in struct
scheduler, and have SEDF set a "always-migrate-on-affinity-change"
flag.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
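A hedged worked example of the replacement expression, assuming a dense
online_affinity mask {0,1,2,3} (cpumask_cycle(n, m) returns the next set
bit in m after position n, wrapping around; weight w = 4 here):

    /* v->vcpu_id % w - 1 = -1, 0, 1, 2 for vcpus 0..3, so:
     *   vcpu 0 -> cycle(-1) -> pCPU 0     vcpu 2 -> cycle(1) -> pCPU 2
     *   vcpu 1 -> cycle(0)  -> pCPU 1     vcpu 3 -> cycle(2) -> pCPU 3
     * instead of cpumask_first() sending every vCPU to pCPU 0. */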
--- a/xen/common/sched_sedf.c
+++ b/xen/common/sched_sedf.c
@@ -396,7 +396,8 @@ static int sedf_pick_cpu(const struct sc
online = cpupool_scheduler_cpumask(v->domain->cpupool);
cpumask_and(&online_affinity, v->cpu_affinity, online);
- return cpumask_first(&online_affinity);
+ return cpumask_cycle(v->vcpu_id % cpumask_weight(&online_affinity) - 1,
+ &online_affinity);
}
/*
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -611,7 +611,8 @@ int vcpu_set_affinity(struct vcpu *v, co
vcpu_schedule_lock_irq(v);
cpumask_copy(v->cpu_affinity, affinity);
- if ( !cpumask_test_cpu(v->processor, v->cpu_affinity) )
+ if ( VCPU2OP(v)->sched_id == XEN_SCHEDULER_SEDF ||
+ !cpumask_test_cpu(v->processor, v->cpu_affinity) )
set_bit(_VPF_migrating, &v->pause_flags);
vcpu_schedule_unlock_irq(v);

@@ -1,134 +0,0 @@
# Commit d463b005bbd6475ed930a302821efe239e1b2cf9
# Date 2013-03-04 10:19:34 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: defer processing events on the NMI exit path
Otherwise, we may end up in the scheduler, keeping NMIs masked for a
possibly unbounded period of time (until whenever the next IRET gets
executed). Enforce timely event processing by sending a self IPI.
Of course it's open for discussion whether to always use the straight
exit path from handle_ist_exception.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
--- a/xen/arch/x86/x86_32/entry.S
+++ b/xen/arch/x86/x86_32/entry.S
@@ -60,6 +60,7 @@
#include <asm/apicdef.h>
#include <asm/page.h>
#include <public/xen.h>
+#include <irq_vectors.h>
ALIGN
restore_all_guest:
@@ -561,6 +562,8 @@ ENTRY(early_page_fault)
jmp restore_all_xen
.popsection
+ENTRY(nmi)
+ pushl $TRAP_nmi<<16
handle_nmi_mce:
#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
# NMI/MCE entry protocol is incompatible with guest kernel in ring 0.
@@ -581,7 +584,24 @@ handle_nmi_mce:
* cases we have put guest DS/ES on the guest stack frame, which will
* be detected by SAVE_ALL(), or we have rolled back restore_guest.
*/
- jmp ret_from_intr
+ cmpb $TRAP_nmi,UREGS_entry_vector(%esp)
+ jne ret_from_intr
+ /* We want to get straight to the IRET on the NMI exit path. */
+ GET_CURRENT(%ebx)
+ movl UREGS_eflags(%esp),%eax
+ movb UREGS_cs(%esp),%al
+ testl $(3|X86_EFLAGS_VM),%eax
+ jz restore_all_xen
+ /* Send an IPI to ourselves to cover for the lack of event checking. */
+ movl VCPU_processor(%ebx),%eax
+ shll $IRQSTAT_shift,%eax
+ cmpl $0,irq_stat(%eax)
+ je restore_all_guest
+ pushl $EVENT_CHECK_VECTOR
+ call send_IPI_self
+ addl $4,%esp
+ jmp restore_all_guest
+
.Lnmi_mce_xen:
/* Check the outer (guest) context for %ds/%es state validity. */
GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%ebx)
@@ -613,10 +633,6 @@ handle_nmi_mce:
jmp .Lnmi_mce_common
#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
-ENTRY(nmi)
- pushl $TRAP_nmi<<16
- jmp handle_nmi_mce
-
ENTRY(machine_check)
pushl $TRAP_machine_check<<16
jmp handle_nmi_mce
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -171,7 +171,7 @@ compat_bad_hypercall:
jmp compat_test_all_events
/* %rbx: struct vcpu, interrupts disabled */
-compat_restore_all_guest:
+ENTRY(compat_restore_all_guest)
ASSERT_INTERRUPTS_DISABLED
RESTORE_ALL
addq $8,%rsp
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -11,6 +11,7 @@
#include <asm/apicdef.h>
#include <asm/page.h>
#include <public/xen.h>
+#include <irq_vectors.h>
ALIGN
/* %rbx: struct vcpu */
@@ -617,6 +618,9 @@ ENTRY(early_page_fault)
jmp restore_all_xen
.popsection
+ENTRY(nmi)
+ pushq $0
+ movl $TRAP_nmi,4(%rsp)
handle_ist_exception:
SAVE_ALL
testb $3,UREGS_cs(%rsp)
@@ -631,12 +635,25 @@ handle_ist_exception:
movl UREGS_entry_vector(%rsp),%eax
leaq exception_table(%rip),%rdx
callq *(%rdx,%rax,8)
- jmp ret_from_intr
+ cmpb $TRAP_nmi,UREGS_entry_vector(%rsp)
+ jne ret_from_intr
-ENTRY(nmi)
- pushq $0
- movl $TRAP_nmi,4(%rsp)
- jmp handle_ist_exception
+ /* We want to get straight to the IRET on the NMI exit path. */
+ testb $3,UREGS_cs(%rsp)
+ jz restore_all_xen
+ GET_CURRENT(%rbx)
+ /* Send an IPI to ourselves to cover for the lack of event checking. */
+ movl VCPU_processor(%rbx),%eax
+ shll $IRQSTAT_shift,%eax
+ leaq irq_stat(%rip),%rcx
+ cmpl $0,(%rcx,%rax,1)
+ je 1f
+ movl $EVENT_CHECK_VECTOR,%edi
+ call send_IPI_self
+1: movq VCPU_domain(%rbx),%rax
+ cmpb $0,DOMAIN_is_32bit_pv(%rax)
+ je restore_all_guest
+ jmp compat_restore_all_guest
ENTRY(machine_check)
pushq $0

@@ -1,113 +0,0 @@
# Commit be6507509454adf3bb5a50b9406c88504e996d5a
# Date 2013-03-04 13:37:39 +0100
# Author George Dunlap <george.dunlap@eu.citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
credit1: Use atomic bit operations for the flags structure
The flags structure is not protected by locks (or more precisely,
it is protected using an inconsistent set of locks); we therefore need
to make sure that all accesses are atomic-safe. This is particularly
important in the case of the PARKED flag, which, if clobbered while
changing the YIELD bit, will leave a vcpu wedged in an offline state.
Using the atomic bitops also requires us to change the size of the "flags"
element.
Spotted-by: Igor Pavlikevich <ipavlikevich@gmail.com>
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
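An illustration of the race being closed (bit names from the patch; the
interleaving is schematic): a plain |= compiles to a non-atomic
load/modify/store, so concurrent updates to different bits of the same
word can be lost.

    /* CPU A (yield path)            CPU B (accounting path)
     * tmp = svc->flags;   // 0
     *                               svc->flags |= PARKED;  // PARKED set
     * tmp |= YIELD;
     * svc->flags = tmp;   // YIELD  <- PARKED lost: the vcpu was paused,
     *                                  but nothing will ever unpause it.
     *
     * set_bit()/test_and_set_bit()/clear_bit() perform the whole
     * read-modify-write atomically (a LOCKed instruction on x86), which
     * is also why the field's type must change to match what the bitops
     * operate on. */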
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -58,8 +58,8 @@
/*
* Flags
*/
-#define CSCHED_FLAG_VCPU_PARKED 0x0001 /* VCPU over capped credits */
-#define CSCHED_FLAG_VCPU_YIELD 0x0002 /* VCPU yielding */
+#define CSCHED_FLAG_VCPU_PARKED 0x0 /* VCPU over capped credits */
+#define CSCHED_FLAG_VCPU_YIELD 0x1 /* VCPU yielding */
/*
@@ -132,7 +132,7 @@ struct csched_vcpu {
struct vcpu *vcpu;
atomic_t credit;
s_time_t start_time; /* When we were scheduled (used for credit) */
- uint16_t flags;
+ unsigned flags;
int16_t pri;
#ifdef CSCHED_STATS
struct {
@@ -214,7 +214,7 @@ __runq_insert(unsigned int cpu, struct c
/* If the vcpu yielded, try to put it behind one lower-priority
* runnable vcpu if we can. The next runq_sort will bring it forward
* within 30ms if the queue too long. */
- if ( svc->flags & CSCHED_FLAG_VCPU_YIELD
+ if ( test_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags)
&& __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
{
iter=iter->next;
@@ -776,7 +776,7 @@ csched_vcpu_wake(const struct scheduler
* those.
*/
if ( svc->pri == CSCHED_PRI_TS_UNDER &&
- !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+ !test_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
{
svc->pri = CSCHED_PRI_TS_BOOST;
}
@@ -789,12 +789,12 @@ csched_vcpu_wake(const struct scheduler
static void
csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc)
{
- struct csched_vcpu * const sv = CSCHED_VCPU(vc);
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
if ( !sched_credit_default_yield )
{
/* Let the scheduler know that this vcpu is trying to yield */
- sv->flags |= CSCHED_FLAG_VCPU_YIELD;
+ set_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags);
}
}
@@ -1122,11 +1122,10 @@ csched_acct(void* dummy)
/* Park running VCPUs of capped-out domains */
if ( sdom->cap != 0U &&
credit < -credit_cap &&
- !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+ !test_and_set_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
{
CSCHED_STAT_CRANK(vcpu_park);
vcpu_pause_nosync(svc->vcpu);
- svc->flags |= CSCHED_FLAG_VCPU_PARKED;
}
/* Lower bound on credits */
@@ -1142,7 +1141,7 @@ csched_acct(void* dummy)
svc->pri = CSCHED_PRI_TS_UNDER;
/* Unpark any capped domains whose credits go positive */
- if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
+ if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
{
/*
* It's important to unset the flag AFTER the unpause()
@@ -1151,7 +1150,6 @@ csched_acct(void* dummy)
*/
CSCHED_STAT_CRANK(vcpu_unpark);
vcpu_unpause(svc->vcpu);
- svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
}
/* Upper bound on credits means VCPU stops earning */
@@ -1410,8 +1408,7 @@ csched_schedule(
/*
* Clear YIELD flag before scheduling out
*/
- if ( scurr->flags & CSCHED_FLAG_VCPU_YIELD )
- scurr->flags &= ~(CSCHED_FLAG_VCPU_YIELD);
+ clear_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags);
/*
* SMP Load balance:

@@ -1,36 +0,0 @@
# Commit d9fb28ae6d41c8201482948660e52889481830dd
# Date 2013-03-04 13:42:17 +0100
# Author Olaf Hering <olaf@aepfle.de>
# Committer Jan Beulich <jbeulich@suse.com>
xentrace: fix off-by-one in calculate_tbuf_size
Commit "xentrace: reduce trace buffer size to something mfn_offset can
reach" contains an off-by-one bug. max_mfn_offset needs to be reduced by
exactly the value of t_info_first_offset.
If the system has two cpus and the number of requested trace pages is
very large, the final number of trace pages + the offset will not fit
into a short. As a result the variable offset in alloc_trace_bufs() will
wrap while allocating buffers for the second cpu. Later
share_xen_page_with_privileged_guests() will be called with a wrong page
and the ASSERT in this function triggers. If the ASSERT is ignored by
running a non-dbg hypervisor, the asserts in xentrace itself trigger,
since "cons" is not aligned: the very last trace page for the
second cpu is a random mfn.
Thanks to Jan for the quick analysis.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
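A hedged worked example with purely illustrative numbers: suppose
t_info_first_offset = 4 and the 16-bit offset field tops out at 65535.
The correct bound is 65535 - 4 = 65531 trace pages per cpu; the buggy
65535 - (4 - 1) = 65532 admits one page too many, so the last cpu's
offset reaches 4 + 65532 = 65536 and wraps the field to 0.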
--- a/xen/common/trace.c
+++ b/xen/common/trace.c
@@ -133,7 +133,7 @@ static int calculate_tbuf_size(unsigned
* The array of mfns for the highest cpu can start at the maximum value
* mfn_offset can hold. So reduce the number of cpus and also the mfn_offset.
*/
- max_mfn_offset -= t_info_first_offset - 1;
+ max_mfn_offset -= t_info_first_offset;
max_cpus--;
if ( max_cpus )
max_mfn_offset /= max_cpus;

@@ -1,25 +0,0 @@
# Commit 9581c4f9a55372a21e759cd449cb676d0e8feddb
# Date 2013-03-06 17:10:26 +0100
# Author Matthew Daley <mattjd@gmail.com>
# Committer Jan Beulich <jbeulich@suse.com>
fix domain unlocking in some xsm error paths
A couple of xsm error/access-denied code paths in hypercalls neglect to
unlock a previously locked domain. Fix by ensuring the domains are
unlocked correctly.
Signed-off-by: Matthew Daley <mattjd@gmail.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
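The bug class, as a hedged sketch (control flow illustrative, not the
exact function body): in the goto-cleanup idiom each error label must
undo exactly what was acquired before the failure point.

    d = rcu_lock_domain_by_id(op.dom);       /* reference acquired */
    if ( d == NULL )
        goto out1;                           /* nothing held: out1 is fine */

    rc = xsm_grant_setup(current->domain, d);
    if ( rc ) {
        op.status = GNTST_permission_denied;
        goto out2;                           /* was out1: leaked the ref */
    }
    /* ... */
 out2:
    rcu_unlock_domain(d);                    /* undo the acquire */
 out1:
    /* copy op.status back to the guest and return */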
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -2262,7 +2262,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL
rc = xsm_grant_setup(current->domain, d);
if ( rc ) {
op.status = GNTST_permission_denied;
- goto out1;
+ goto out2;
}
gt = d->grant_table;

@@ -1,369 +0,0 @@
# Commit 4245d331e0e75de8d1bddbbb518f3a8ce6d0bb7e
# Date 2013-03-08 14:05:34 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI: add mechanism to fully protect MSI-X table from PV guest accesses
This adds two new physdev operations for Dom0 to invoke when resource
allocation for devices is known to be complete, so that the hypervisor
can arrange for the respective MMIO ranges to be marked read-only
before an eventual guest getting such a device assigned even gets
started, such that it won't be able to set up writable mappings for
these MMIO ranges before Xen has a chance to protect them.
This also addresses another issue with the code being modified here,
in that so far write protection for the address ranges in question got
set up only once during the lifetime of a device (i.e. until either
system shutdown or device hot removal), while teardown happened when
the last interrupt was disposed of by the guest (which at least allowed
the tables to be writable when the device got assigned to a second
guest [instance] after the first terminated).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
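A hedged sketch of how a dom0 kernel might drive the new pair (the Linux
hypercall wrapper is shown for illustration; exact plumbing varies):

    struct physdev_pci_device dev = {
        .seg   = pci_domain_nr(pdev->bus),
        .bus   = pdev->bus->number,
        .devfn = pdev->devfn,
    };
    int rc;

    /* Resource assignment final: let Xen write-protect the MSI-X ranges. */
    rc = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &dev);

    /* Before redoing resource assignment, release the protection again. */
    rc = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, &dev);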
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -649,8 +649,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
* @entries: pointer to an array of struct msix_entry entries
* @nvec: number of @entries
*
- * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
+ * Setup the MSI-X capability structure of device function with the requested
+ * number MSI-X irqs. A return of zero indicates the successful setup of
* requested MSI-X entries with allocated irqs or non-zero for otherwise.
**/
static int msix_capability_init(struct pci_dev *dev,
@@ -658,86 +658,69 @@ static int msix_capability_init(struct p
struct msi_desc **desc,
unsigned int nr_entries)
{
- struct msi_desc *entry;
- int pos;
+ struct msi_desc *entry = NULL;
+ int pos, vf;
u16 control;
- u64 table_paddr, entry_paddr;
- u32 table_offset, entry_offset;
- u8 bir;
- void __iomem *base;
- int idx;
+ u64 table_paddr;
+ u32 table_offset;
+ u8 bir, pbus, pslot, pfunc;
u16 seg = dev->seg;
u8 bus = dev->bus;
u8 slot = PCI_SLOT(dev->devfn);
u8 func = PCI_FUNC(dev->devfn);
ASSERT(spin_is_locked(&pcidevs_lock));
- ASSERT(desc);
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
- /* MSI-X Table Initialization */
- entry = alloc_msi_entry();
- if ( !entry )
- return -ENOMEM;
+ if ( desc )
+ {
+ entry = alloc_msi_entry();
+ if ( !entry )
+ return -ENOMEM;
+ ASSERT(msi);
+ }
- /* Request & Map MSI-X table region */
+ /* Locate MSI-X table region */
table_offset = pci_conf_read32(seg, bus, slot, func,
msix_table_offset_reg(pos));
bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
table_offset &= ~PCI_MSIX_BIRMASK;
- entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
- table_paddr = msi->table_base + table_offset;
- entry_paddr = table_paddr + entry_offset;
- idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
- if ( idx < 0 )
- {
- xfree(entry);
- return idx;
- }
- base = (void *)(fix_to_virt(idx) +
- ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
-
- entry->msi_attrib.type = PCI_CAP_ID_MSIX;
- entry->msi_attrib.is_64 = 1;
- entry->msi_attrib.entry_nr = msi->entry_nr;
- entry->msi_attrib.maskbit = 1;
- entry->msi_attrib.masked = 1;
- entry->msi_attrib.pos = pos;
- entry->irq = msi->irq;
- entry->dev = dev;
- entry->mask_base = base;
-
- list_add_tail(&entry->list, &dev->msi_list);
-
- if ( !dev->msix_nr_entries )
+ if ( !dev->info.is_virtfn )
{
- u8 pbus, pslot, pfunc;
- int vf;
- u64 pba_paddr;
- u32 pba_offset;
+ pbus = bus;
+ pslot = slot;
+ pfunc = func;
+ vf = -1;
+ }
+ else
+ {
+ pbus = dev->info.physfn.bus;
+ pslot = PCI_SLOT(dev->info.physfn.devfn);
+ pfunc = PCI_FUNC(dev->info.physfn.devfn);
+ vf = PCI_BDF2(dev->bus, dev->devfn);
+ }
- if ( !dev->info.is_virtfn )
- {
- pbus = bus;
- pslot = slot;
- pfunc = func;
- vf = -1;
- }
- else
+ table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
+ WARN_ON(msi && msi->table_base != table_paddr);
+ if ( !table_paddr )
+ {
+ if ( !msi || !msi->table_base )
{
- pbus = dev->info.physfn.bus;
- pslot = PCI_SLOT(dev->info.physfn.devfn);
- pfunc = PCI_FUNC(dev->info.physfn.devfn);
- vf = PCI_BDF2(dev->bus, dev->devfn);
+ xfree(entry);
+ return -ENXIO;
}
+ table_paddr = msi->table_base;
+ }
+ table_paddr += table_offset;
- ASSERT(!dev->msix_used_entries);
- WARN_ON(msi->table_base !=
- read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf));
+ if ( !dev->msix_used_entries )
+ {
+ u64 pba_paddr;
+ u32 pba_offset;
dev->msix_nr_entries = nr_entries;
dev->msix_table.first = PFN_DOWN(table_paddr);
@@ -758,7 +741,42 @@ static int msix_capability_init(struct p
BITS_TO_LONGS(nr_entries) - 1);
WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
dev->msix_pba.last));
+ }
+
+ if ( entry )
+ {
+ /* Map MSI-X table region */
+ u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+ int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
+ void __iomem *base;
+
+ if ( idx < 0 )
+ {
+ xfree(entry);
+ return idx;
+ }
+ base = (void *)(fix_to_virt(idx) +
+ ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
+ /* Mask interrupt here */
+ writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+ entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+ entry->msi_attrib.is_64 = 1;
+ entry->msi_attrib.entry_nr = msi->entry_nr;
+ entry->msi_attrib.maskbit = 1;
+ entry->msi_attrib.masked = 1;
+ entry->msi_attrib.pos = pos;
+ entry->irq = msi->irq;
+ entry->dev = dev;
+ entry->mask_base = base;
+
+ list_add_tail(&entry->list, &dev->msi_list);
+ *desc = entry;
+ }
+
+ if ( !dev->msix_used_entries )
+ {
if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
dev->msix_table.last) )
WARN();
@@ -769,7 +787,7 @@ static int msix_capability_init(struct p
if ( dev->domain )
p2m_change_entry_type_global(dev->domain,
p2m_mmio_direct, p2m_mmio_direct);
- if ( !dev->domain || !paging_mode_translate(dev->domain) )
+ if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
{
struct domain *d = dev->domain;
@@ -783,6 +801,13 @@ static int msix_capability_init(struct p
break;
if ( d )
{
+ if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
+ {
+ dev->msix_warned = d->domain_id;
+ printk(XENLOG_ERR
+ "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by Dom%d\n",
+ seg, bus, slot, func, d->domain_id);
+ }
/* XXX How to deal with existing mappings? */
}
}
@@ -791,10 +816,6 @@ static int msix_capability_init(struct p
WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
++dev->msix_used_entries;
- /* Mask interrupt here */
- writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
- *desc = entry;
/* Restore MSI-X enabled bits */
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
@@ -919,6 +940,19 @@ static int __pci_enable_msix(struct msi_
return status;
}
+static void _pci_cleanup_msix(struct pci_dev *dev)
+{
+ if ( !--dev->msix_used_entries )
+ {
+ if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+ dev->msix_table.last) )
+ WARN();
+ if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+ dev->msix_pba.last) )
+ WARN();
+ }
+}
+
static void __pci_disable_msix(struct msi_desc *entry)
{
struct pci_dev *dev;
@@ -942,15 +976,45 @@ static void __pci_disable_msix(struct ms
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
- if ( !--dev->msix_used_entries )
+ _pci_cleanup_msix(dev);
+}
+
+int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
+{
+ int rc;
+ struct pci_dev *pdev;
+ u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+
+ if ( !use_msi )
+ return 0;
+
+ if ( !pos )
+ return -ENODEV;
+
+ spin_lock(&pcidevs_lock);
+ pdev = pci_get_pdev(seg, bus, devfn);
+ if ( !pdev )
+ rc = -ENODEV;
+ else if ( pdev->msix_used_entries != !!off )
+ rc = -EBUSY;
+ else if ( off )
{
- if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
- dev->msix_table.last) )
- WARN();
- if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
- dev->msix_pba.last) )
- WARN();
+ _pci_cleanup_msix(pdev);
+ rc = 0;
}
+ else
+ {
+ u16 control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(pos));
+
+ rc = msix_capability_init(pdev, NULL, NULL,
+ multi_msix_capable(control));
+ }
+ spin_unlock(&pcidevs_lock);
+
+ return rc;
}
/*
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -609,6 +609,18 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
break;
}
+ case PHYSDEVOP_prepare_msix:
+ case PHYSDEVOP_release_msix: {
+ struct physdev_pci_device dev;
+
+ if ( copy_from_guest(&dev, arg, 1) )
+ ret = -EFAULT;
+ else
+ ret = pci_prepare_msix(dev.seg, dev.bus, dev.devfn,
+ cmd != PHYSDEVOP_prepare_msix);
+ break;
+ }
+
#ifdef __x86_64__
case PHYSDEVOP_pci_mmcfg_reserved: {
struct physdev_pci_mmcfg_reserved info;
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -80,6 +80,7 @@ struct msi_desc;
/* Helper functions */
extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
extern void pci_disable_msi(struct msi_desc *desc);
+extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off);
extern void pci_cleanup_msi(struct pci_dev *pdev);
extern void setup_msi_handler(struct irq_desc *, struct msi_desc *);
extern void setup_msi_irq(struct irq_desc *);
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -303,6 +303,12 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi
#define PHYSDEVOP_pci_device_remove 26
#define PHYSDEVOP_restore_msi_ext 27
+/*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix 30
+#define PHYSDEVOP_release_msix 31
struct physdev_pci_device {
/* IN */
uint16_t seg;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -57,6 +57,7 @@ struct pci_dev {
int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
int msix_table_idx[MAX_MSIX_TABLE_PAGES];
spinlock_t msix_table_lock;
+ domid_t msix_warned;
struct domain *domain;
const u16 seg;

@@ -1,139 +0,0 @@
# Commit 1d80765b504b34b63a42a63aff4291e07e29f0c5
# Date 2013-03-12 15:34:22 +0100
# Author Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
# Committer Jan Beulich <jbeulich@suse.com>
powernow: add fixups for AMD P-state figures
In the Linux kernel, these two git commits:
- f594065faf4f9067c2283a34619fc0714e79a98d
ACPI: Add fixups for AMD P-state figures
- 9855d8ce41a7801548a05d844db2f46c3e810166
ACPI: Check MSR valid bit before using P-state frequencies
Try to fix the issue that "some AMD systems may round the
frequencies in ACPI tables to 100MHz boundaries. We can obtain the real
frequencies from MSRs, so add a quirk to fix these frequencies up
on AMD systems." (from f594065..)
In discussion (around 9855d8..) "it turned out that indeed real
HW/BIOSes may choose to not set the valid bit and thus mark the
P-state as invalid. So this could be considered a fix for broken
BIOSes." (from 9855d8..)
which is great for Linux. Unfortunately, when the Linux kernel
tries to do the RDMSR under Xen, it fails to get the right
value, as Xen traps the access and returns zero. Hence
when dom0 uploads the P-states they will be unmodified and
we should take care of updating the frequencies with the right
values.
I've tested it under Dell Inc. PowerEdge T105 /0RR825, BIOS 1.3.2
08/20/2008 where this quirk can be observed (x86 == 0x10, model == 2).
Also on other AMD (x86 == 0x12, A8-3850; x86 = 0x14, AMD E-350) to
make sure the quirk is not applied there.
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: stefan.bader@canonical.com
Do the MSR access here (and while at it, also the one reading
MSR_PSTATE_CUR_LIMIT) on the target CPU, and bound the loop over
amd_fixup_frequency() by max_hw_pstate (matching the one in
powernow_cpufreq_cpu_init()).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
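A hedged worked example of the decode in amd_fixup_frequency() below:
with the MSR's low dword giving fid = 0x0c (12) and did = 0, a family
0x10 CPU reports (100 * (12 + 16)) >> 0 = 2800 MHz, while the ACPI table
may carry the same P-state rounded to a 100 MHz boundary from a
different base; the MSR-derived figure replaces it.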
--- a/xen/arch/x86/acpi/cpufreq/powernow.c
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c
@@ -159,6 +159,51 @@ static int powernow_cpufreq_target(struc
return result;
}
+static void amd_fixup_frequency(struct xen_processor_px *px)
+{
+ u32 hi, lo, fid, did;
+ int index = px->control & 0x00000007;
+ const struct cpuinfo_x86 *c = &current_cpu_data;
+
+ if ((c->x86 != 0x10 || c->x86_model >= 10) && c->x86 != 0x11)
+ return;
+
+ rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
+ /*
+ * MSR C001_0064+:
+ * Bit 63: PstateEn. Read-write. If set, the P-state is valid.
+ */
+ if (!(hi & (1U << 31)))
+ return;
+
+ fid = lo & 0x3f;
+ did = (lo >> 6) & 7;
+ if (c->x86 == 0x10)
+ px->core_frequency = (100 * (fid + 16)) >> did;
+ else
+ px->core_frequency = (100 * (fid + 8)) >> did;
+}
+
+struct amd_cpu_data {
+ struct processor_performance *perf;
+ u32 max_hw_pstate;
+};
+
+static void get_cpu_data(void *arg)
+{
+ struct amd_cpu_data *data = arg;
+ struct processor_performance *perf = data->perf;
+ uint64_t msr_content;
+ unsigned int i;
+
+ rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content);
+ data->max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >>
+ HW_PSTATE_MAX_SHIFT;
+
+ for (i = 0; i < perf->state_count && i <= data->max_hw_pstate; i++)
+ amd_fixup_frequency(&perf->states[i]);
+}
+
static int powernow_cpufreq_verify(struct cpufreq_policy *policy)
{
struct acpi_cpufreq_data *data;
@@ -205,8 +250,7 @@ static int powernow_cpufreq_cpu_init(str
struct acpi_cpufreq_data *data;
unsigned int result = 0;
struct processor_performance *perf;
- u32 max_hw_pstate;
- uint64_t msr_content;
+ struct amd_cpu_data info;
struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
data = xzalloc(struct acpi_cpufreq_data);
@@ -217,7 +261,7 @@ static int powernow_cpufreq_cpu_init(str
data->acpi_data = &processor_pminfo[cpu]->perf;
- perf = data->acpi_data;
+ info.perf = perf = data->acpi_data;
policy->shared_type = perf->shared_type;
if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
@@ -239,8 +283,6 @@ static int powernow_cpufreq_cpu_init(str
result = -ENODEV;
goto err_unreg;
}
- rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content);
- max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
if (perf->control_register.space_id != perf->status_register.space_id) {
result = -ENODEV;
@@ -265,8 +307,10 @@ static int powernow_cpufreq_cpu_init(str
policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+ on_selected_cpus(cpumask_of(cpu), get_cpu_data, &info, 1);
+
/* table init */
- for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) {
+ for (i = 0; i < perf->state_count && i <= info.max_hw_pstate; i++) {
if (i > 0 && perf->states[i].core_frequency >=
data->freq_table[valid_states-1].frequency / 1000)
continue;

@@ -1,72 +0,0 @@
References: bnc#805579
# Commit b0583c0e64cc8bb6229c95c3304fdac2051f79b3
# Date 2013-03-12 15:53:30 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MCA: suppress bank clearing for certain injected events
The bits indicating validity of the ADDR and MISC bank MSRs may be
injected in a way that isn't consistent with what the underlying
hardware implements (while the bank must be valid for injection to
work, the auxiliary MSRs may not be implemented - and hence cause #GP
upon access - if the hardware never sets the corresponding valid bits).
Consequently we need to do the clearing writes only if no value was
interposed for the respective MSR (which also makes sense the other way
around: there's no point in clearing a hardware register when all data
read came from software). Of course this all requires the injection
tool to do things in a consistent way (but that had been a requirement
before already).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Ren Yongjie <yongjie.ren@intel.com>
Acked-by: Liu Jinsong <jinsong.liu@intel.com>
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -1145,13 +1145,15 @@ static void intpose_add(unsigned int cpu
printk("intpose_add: interpose array full - request dropped\n");
}
-void intpose_inval(unsigned int cpu_nr, uint64_t msr)
+bool_t intpose_inval(unsigned int cpu_nr, uint64_t msr)
{
- struct intpose_ent *ent;
+ struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
- if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
- ent->cpu_nr = -1;
- }
+ if ( !ent )
+ return 0;
+
+ ent->cpu_nr = -1;
+ return 1;
}
#define IS_MCA_BANKREG(r) \
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -89,7 +89,7 @@ extern void mce_recoverable_register(mce
/* Read an MSR, checking for an interposed value first */
extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t,
uint64_t *);
-extern void intpose_inval(unsigned int, uint64_t);
+extern bool_t intpose_inval(unsigned int, uint64_t);
static inline uint64_t mca_rdmsr(unsigned int msr)
{
@@ -101,9 +101,9 @@ static inline uint64_t mca_rdmsr(unsigne
/* Write an MSR, invalidating any interposed value */
#define mca_wrmsr(msr, val) do { \
- intpose_inval(smp_processor_id(), msr); \
- wrmsrl(msr, val); \
-} while (0)
+ if ( !intpose_inval(smp_processor_id(), msr) ) \
+ wrmsrl(msr, val); \
+} while ( 0 )
/* Utility function to "logout" all architectural MCA telemetry from the MCA

@@ -1,32 +0,0 @@
# Commit 0f7b6f91ac1bbfd33b23c291b14874b9561909d2
# Date 2013-03-20 10:00:01 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
AMD/IOMMU: Process softirqs while building dom0 iommu mappings
Recent changes which have made their way into xen-4.2 stable have pushed the
runtime of construct_dom0() over 5 seconds, which has caused regressions in
XenServer testing because of our 5 second watchdog.
The root cause is that amd_iommu_dom0_init() does not process softirqs and in
particular the nmi_timer which causes the watchdog to decide that no useful
progress is being made.
This patch adds periodic calls to process_pending_softirqs() at the same
interval as the Intel variant of this function. The server which was failing
with the watchdog test now boots reliably with a timeout of 1 second.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
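For scale, the polling interval in the hunk below: !(i & 0xfffff) is
true once every 2^20 pfns, i.e. once per 4 GiB of address space at
4 KiB pages - the same cadence the VT-d dom0-init loop already uses.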
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -285,6 +285,9 @@ static void __init amd_iommu_dom0_init(s
if ( mfn_valid(pfn) )
amd_iommu_map_page(d, pfn, pfn,
IOMMUF_readable|IOMMUF_writable);
+
+ if ( !(i & 0xfffff) )
+ process_pending_softirqs();
}
}

@@ -1,22 +0,0 @@
# Commit 32861c537781ac94bf403fb778505c3679b85f67
# Date 2013-03-20 10:02:26 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
VT-d: Enumerate IOMMUs when listing capabilities
This saves N identical console log lines on a multi-iommu server.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2135,7 +2135,8 @@ int __init intel_vtd_setup(void)
{
iommu = drhd->iommu;
- printk("Intel VT-d supported page sizes: 4kB");
+ printk("Intel VT-d iommu %"PRIu32" supported page sizes: 4kB",
+ iommu->index);
if (cap_sps_2mb(iommu->cap))
printk(", 2MB");

@@ -1,28 +0,0 @@
# Commit 759847e44401176401e86e7c55b644cb9f93c781
# Date 2013-03-20 10:02:52 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
ACPI/ERST: Name table in otherwise opaque error messages
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Fix spelling and lower severities.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/apei/erst.c
+++ b/xen/drivers/acpi/apei/erst.c
@@ -799,11 +799,11 @@ int __init erst_init(void)
status = acpi_get_table(ACPI_SIG_ERST, 0,
(struct acpi_table_header **)&erst_tab);
if (status == AE_NOT_FOUND) {
- printk(KERN_ERR "Table is not found!\n");
+ printk(KERN_INFO "ERST table was not found\n");
return -ENODEV;
} else if (ACPI_FAILURE(status)) {
const char *msg = acpi_format_exception(status);
- printk(KERN_ERR "Failed to get table, %s\n", msg);
+ printk(KERN_WARNING "Failed to get ERST table: %s\n", msg);
return -EINVAL;
}

@@ -1,34 +0,0 @@
References: bnc#785211
# Commit 0611689d9153227831979c7bafe594214b8505a3
# Date 2013-03-22 09:43:38 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
ACPI/APEI: Unlock apei_iomaps_lock on error path
The missed unlock causes deadlocks during early boot on hardware with
broken/buggy APEI implementations, such as a Dell Poweredge 2950 with
the latest currently available BIOS.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Don't use goto or another special error path, as handling the error
case in normal flow is quite simple.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/acpi/apei/apei-io.c
+++ b/xen/drivers/acpi/apei/apei-io.c
@@ -146,10 +146,8 @@ static void __init apei_post_unmap(paddr
spin_lock_irqsave(&apei_iomaps_lock, flags);
map = __apei_find_iomap(paddr, size);
- if (!map)
- return;
-
- list_del(&map->list);
+ if (map)
+ list_del(&map->list);
spin_unlock_irqrestore(&apei_iomaps_lock, flags);
xfree(map);

@@ -1,70 +0,0 @@
References: bnc#785211
# Commit 72af01bf6f7489e54ad59270222a29d3e8c501d1
# Date 2013-03-22 12:46:25 +0100
# Author Huang Ying <ying.huang@intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
ACPI, APEI: Add apei_exec_run_optional
Some actions in APEI ERST and EINJ tables are optional, for example,
ACPI_EINJ_BEGIN_OPERATION action is used to do some preparation for
error injection, and firmware may choose to do nothing here. While
some other actions are mandatory, for example, firmware must provide
ACPI_EINJ_GET_ERROR_TYPE implementation.
The original implementation treats all actions as optional (that is,
they may have no instructions), which may cause issues if the firmware
does not provide some mandatory action. To fix this, this patch adds
apei_exec_run_optional, which should be used for optional actions.
The original apei_exec_run should be used for mandatory actions.
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
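A hedged usage sketch (action names from ACPICA's EINJ definitions;
error handling elided):

    /* Preparation may legitimately be absent from the firmware table: */
    rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION);

    /* ...but a table lacking this mandatory action is broken, and the
     * caller now sees -ENOENT instead of a silent no-op: */
    rc = apei_exec_run(&ctx, ACPI_EINJ_GET_ERROR_TYPE);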
--- a/xen/drivers/acpi/apei/apei-base.c
+++ b/xen/drivers/acpi/apei/apei-base.c
@@ -154,9 +154,10 @@ int apei_exec_noop(struct apei_exec_cont
* Interpret the specified action. Go through whole action table,
* execute all instructions belong to the action.
*/
-int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action,
+ bool_t optional)
{
- int rc;
+ int rc = -ENOENT;
u32 i, ip;
struct acpi_whea_header *entry;
apei_exec_ins_func_t run;
@@ -195,7 +196,7 @@ rewind:
goto rewind;
}
- return 0;
+ return !optional && rc < 0 ? rc : 0;
}
typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx,
--- a/xen/drivers/acpi/apei/apei-internal.h
+++ b/xen/drivers/acpi/apei/apei-internal.h
@@ -48,7 +48,18 @@ static inline u64 apei_exec_ctx_get_outp
return ctx->value;
}
-int apei_exec_run(struct apei_exec_context *ctx, u8 action);
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool_t optional);
+
+static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+{
+ return __apei_exec_run(ctx, action, 0);
+}
+
+/* It is optional whether the firmware provides the action */
+static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action)
+{
+ return __apei_exec_run(ctx, action, 1);
+}
/* Common instruction implementation */

@@ -1,96 +0,0 @@
# Commit fae0372140befb88d890a30704a8ec058c902af8
# Date 2013-03-25 14:28:31 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
IOMMU: properly check whether interrupt remapping is enabled
... rather than the IOMMU as a whole.
That in turn required making sure iommu_intremap gets properly
cleared when the respective initialization fails (or isn't being
done at all).
Along with making sure interrupt remapping doesn't get inconsistently
enabled on some IOMMUs and not on others in the VT-d code, this in turn
allowed quite a bit of cleanup on the VT-d side (removed from the
backport).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -204,7 +204,7 @@ static void read_msi_msg(struct msi_desc
BUG();
}
- if ( iommu_enabled )
+ if ( iommu_intremap )
iommu_read_msi_from_ire(entry, msg);
}
@@ -212,7 +212,7 @@ static void write_msi_msg(struct msi_des
{
entry->msg = *msg;
- if ( iommu_enabled )
+ if ( iommu_intremap )
{
ASSERT(msg != &entry->msg);
iommu_update_ire_from_msi(entry, msg);
@@ -482,7 +482,7 @@ int msi_free_irq(struct msi_desc *entry)
}
/* Free the unused IRTE if intr remap enabled */
- if ( iommu_enabled )
+ if ( iommu_intremap )
iommu_update_ire_from_msi(entry, NULL);
list_del(&entry->list);
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -469,6 +469,8 @@ int __init iommu_setup(void)
rc = iommu_hardware_setup();
iommu_enabled = (rc == 0);
}
+ if ( !iommu_enabled )
+ iommu_intremap = 0;
if ( (force_iommu && !iommu_enabled) ||
(force_intremap && !iommu_intremap) )
@@ -485,9 +487,12 @@ int __init iommu_setup(void)
}
printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
if ( iommu_enabled )
+ {
printk(" - Dom0 mode: %s\n",
iommu_passthrough ? "Passthrough" :
iommu_dom0_strict ? "Strict" : "Relaxed");
+ printk("Interrupt remapping %sabled\n", iommu_intremap ? "en" : "dis");
+ }
return rc;
}
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2072,6 +2072,9 @@ static int init_vtd_hw(void)
break;
}
}
+ if ( !iommu_intremap )
+ for_each_drhd_unit ( drhd )
+ disable_intremap(drhd->iommu);
}
/*
--- a/xen/include/asm-x86/io_apic.h
+++ b/xen/include/asm-x86/io_apic.h
@@ -129,7 +129,7 @@ struct IO_APIC_route_entry {
extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
/* Only need to remap ioapic RTE (reg: 10~3Fh) */
-#define ioapic_reg_remapped(reg) (iommu_enabled && ((reg) >= 0x10))
+#define ioapic_reg_remapped(reg) (iommu_intremap && ((reg) >= 0x10))
static inline unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
{

@@ -1,90 +0,0 @@
References: bnc#801910
# Commit 6890cebc6a987d0e896f5d23a8de11a3934101cf
# Date 2013-03-25 14:31:27 +0100
# Author Malcolm Crossley <malcolm.crossley@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
VT-d: deal with 5500/5520/X58 errata
http://www.intel.com/content/www/us/en/chipsets/5520-and-5500-chipset-ioh-specification-update.html
Stepping B-3 has two errata (#47 and #53) related to interrupt
remapping, for which the workaround is for the BIOS to completely disable
interrupt remapping. These errata are fixed in stepping C-2.
Unfortunately this chipset stepping is very common and many BIOSes do
not disable interrupt remapping on it. We can detect this in
Xen and prevent Xen from using the problematic interrupt remapping feature.
The Intel 5500/5520/X58 chipset does not support VT-d
Extended Interrupt Mode(EIM). This means the iommu_supports_eim() check
always fails and so x2apic mode cannot be enabled in Xen before this quirk
disables the interrupt remapping feature.
Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Gate the function call to check the quirk on interrupt remapping being
requested to get enabled, and upon failure disable the IOMMU to be in
line with what the changes for XSA-36 (plus follow-ups) did.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: "Zhang, Xiantao" <xiantao.zhang@intel.com>
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2129,6 +2129,11 @@ int __init intel_vtd_setup(void)
}
platform_quirks_init();
+ if ( !iommu_enabled )
+ {
+ ret = -ENODEV;
+ goto error;
+ }
/* We enable the following features only if they are supported by all VT-d
* engines: Snoop Control, DMA passthrough, Queued Invalidation and
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -248,6 +248,29 @@ void vtd_ops_postamble_quirk(struct iomm
}
}
+/* 5500/5520/X58 Chipset Interrupt remapping errata, for stepping B-3.
+ * Fixed in stepping C-2. */
+static void __init tylersburg_intremap_quirk(void)
+{
+ uint32_t bus, device;
+ uint8_t rev;
+
+ for ( bus = 0; bus < 0x100; bus++ )
+ {
+ /* Match on System Management Registers on Device 20 Function 0 */
+ device = pci_conf_read32(0, bus, 20, 0, PCI_VENDOR_ID);
+ rev = pci_conf_read8(0, bus, 20, 0, PCI_REVISION_ID);
+
+ if ( rev == 0x13 && device == 0x342e8086 )
+ {
+ printk(XENLOG_WARNING VTDPREFIX
+ "Disabling IOMMU due to Intel 5500/5520/X58 Chipset errata #47, #53\n");
+ iommu_enabled = 0;
+ break;
+ }
+ }
+}
+
/* initialize platform identification flags */
void __init platform_quirks_init(void)
{
@@ -268,6 +291,10 @@ void __init platform_quirks_init(void)
/* ioremap IGD MMIO+0x2000 page */
map_igd_reg();
+
+ /* Tylersburg interrupt remap quirk */
+ if ( iommu_intremap )
+ tylersburg_intremap_quirk();
}
/*

@@ -1,63 +0,0 @@
# Commit 92b8bc03bd4b582cb524db51494d0dba7607e7ac
# Date 2013-03-25 16:55:22 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
AMD IOMMU: allow disabling only interrupt remapping when certain IVRS consistency checks fail
After some more thought on XSA-36, and specifically the comments we
got that disabling the IOMMU in this situation altogether makes
things worse instead of better, I came to the conclusion that we can
actually restrict the action in affected cases to just disabling
interrupt remapping. That doesn't make the situation worse than prior
to the XSA-36 fixes (where interrupt remapping didn't really protect
domains from one another), but allows at least DMA isolation to still
be utilized.
To do so, disabling of interrupt remapping must be explicitly requested
on the command line - respective checks will then be skipped.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -664,6 +664,9 @@ static u16 __init parse_ivhd_device_spec
return dev_length;
}
+ if ( !iommu_intremap )
+ return dev_length;
+
/*
* Some BIOSes have IOAPIC broken entries so we check for IVRS
* consistency here --- whether entry's IOAPIC ID is valid and
@@ -902,7 +905,7 @@ static int __init parse_ivrs_table(struc
}
/* Each IO-APIC must have been mentioned in the table. */
- for ( apic = 0; !error && apic < nr_ioapics; ++apic )
+ for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic )
{
if ( !nr_ioapic_entries[apic] ||
ioapic_sbdf[IO_APIC_ID(apic)].pin_setup )
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1192,7 +1192,8 @@ int __init amd_iommu_init(void)
BUG_ON( !iommu_found() );
- if ( amd_iommu_perdev_intremap && amd_sp5100_erratum28() )
+ if ( iommu_intremap && amd_iommu_perdev_intremap &&
+ amd_sp5100_erratum28() )
goto error_out;
ivrs_bdf_entries = amd_iommu_get_ivrs_dev_entries();
@@ -1209,7 +1210,7 @@ int __init amd_iommu_init(void)
goto error_out;
/* initialize io-apic interrupt remapping entries */
- if ( amd_iommu_setup_ioapic_remapping() != 0 )
+ if ( iommu_intremap && amd_iommu_setup_ioapic_remapping() != 0 )
goto error_out;
/* allocate and initialize a global device table shared by all iommus */

@@ -0,0 +1,144 @@
# Commit 9aa356bc9f7533c3cb7f02c823f532532876d444
# Date 2013-04-19 12:29:01 +0200
# Author Ben Guthro <benjamin.guthro@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/S3: Fix cpu pool scheduling after suspend/resume
This change addresses another S3 scheduler problem with the system_state
variable introduced with the following changeset:
http://xenbits.xen.org/gitweb/?p=xen.git;a=commit;h=269f543ea750ed567d18f2e819e5d5ce58eda5c5
Specifically, the cpu_callback function that takes the CPU down during
suspend, and back up during resume. We were seeing situations where,
after S3, only CPU0 was in cpupool0. Guest performance suffered
greatly, since all vcpus were only on a single pcpu. Guests under high
CPU load showed the problem much more quickly than an idle guest.
Removing this if condition forces the CPUs to go through the expected
online/offline state, and be properly scheduled after S3.
This also includes a necessary partial change proposed earlier by
Tomasz Wroblewski here:
http://lists.xen.org/archives/html/xen-devel/2013-01/msg02206.html
It should also resolve the issues discussed in this thread:
http://lists.xen.org/archives/html/xen-devel/2012-11/msg01801.html
Signed-off-by: Ben Guthro <benjamin.guthro@citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -41,16 +41,28 @@ static struct cpupool *alloc_cpupool_str
{
struct cpupool *c = xzalloc(struct cpupool);
- if ( c && zalloc_cpumask_var(&c->cpu_valid) )
- return c;
- xfree(c);
- return NULL;
+ if ( !c || !zalloc_cpumask_var(&c->cpu_valid) )
+ {
+ xfree(c);
+ c = NULL;
+ }
+ else if ( !zalloc_cpumask_var(&c->cpu_suspended) )
+ {
+ free_cpumask_var(c->cpu_valid);
+ xfree(c);
+ c = NULL;
+ }
+
+ return c;
}
static void free_cpupool_struct(struct cpupool *c)
{
if ( c )
+ {
+ free_cpumask_var(c->cpu_suspended);
free_cpumask_var(c->cpu_valid);
+ }
xfree(c);
}
@@ -417,14 +429,32 @@ void cpupool_rm_domain(struct domain *d)
/*
* called to add a new cpu to pool admin
- * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0
+ * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0,
+ * unless we are resuming from S3, in which case we put the cpu back
+ * in the cpupool it was in prior to suspend.
*/
static void cpupool_cpu_add(unsigned int cpu)
{
spin_lock(&cpupool_lock);
cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
cpumask_set_cpu(cpu, &cpupool_free_cpus);
- cpupool_assign_cpu_locked(cpupool0, cpu);
+
+ if ( system_state == SYS_STATE_resume )
+ {
+ struct cpupool **c;
+
+ for_each_cpupool(c)
+ {
+ if ( cpumask_test_cpu(cpu, (*c)->cpu_suspended ) )
+ {
+ cpupool_assign_cpu_locked(*c, cpu);
+ cpumask_clear_cpu(cpu, (*c)->cpu_suspended);
+ }
+ }
+ }
+
+ if ( cpumask_test_cpu(cpu, &cpupool_free_cpus) )
+ cpupool_assign_cpu_locked(cpupool0, cpu);
spin_unlock(&cpupool_lock);
}
@@ -436,7 +466,7 @@ static void cpupool_cpu_add(unsigned int
static int cpupool_cpu_remove(unsigned int cpu)
{
int ret = 0;
-
+
spin_lock(&cpupool_lock);
if ( !cpumask_test_cpu(cpu, cpupool0->cpu_valid))
ret = -EBUSY;
@@ -633,9 +663,14 @@ static int cpu_callback(
unsigned int cpu = (unsigned long)hcpu;
int rc = 0;
- if ( (system_state == SYS_STATE_suspend) ||
- (system_state == SYS_STATE_resume) )
- goto out;
+ if ( system_state == SYS_STATE_suspend )
+ {
+ struct cpupool **c;
+
+ for_each_cpupool(c)
+ if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) )
+ cpumask_set_cpu(cpu, (*c)->cpu_suspended);
+ }
switch ( action )
{
@@ -650,7 +685,6 @@ static int cpu_callback(
break;
}
-out:
return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -199,6 +199,7 @@ struct cpupool
{
int cpupool_id;
cpumask_var_t cpu_valid; /* all cpus assigned to pool */
+ cpumask_var_t cpu_suspended; /* cpus in S3 that should be in this pool */
struct cpupool *next;
unsigned int n_dom;
struct scheduler *sched;

@@ -0,0 +1,142 @@
References: FATE#314499, FATE#314509
# Commit 9be8a4447103d92843fcfeaad8be42408c90e9a9
# Date 2013-04-22 13:58:01 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: pass boot services variable info to runtime code
EFI variables can be flagged as being accessible only within boot services.
This makes it awkward for us to figure out how much space they use at
runtime. In theory we could figure this out by simply comparing the results
from QueryVariableInfo() to the space used by all of our variables, but
that fails if the platform doesn't garbage collect on every boot. Thankfully,
calling QueryVariableInfo() while still inside boot services gives a more
reliable answer. This patch passes that information from the EFI boot stub
up to the efi platform code.
Based on a similarly named Linux patch by Matthew Garrett <matthew.garrett@nebula.com>.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
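The idea is a boot-time snapshot: QueryVariableInfo() is only trustworthy while boot services are still up, so its three outputs are captured once and replayed whenever a guest asks later. A compact model of that hand-off (types and names simplified, not the actual Xen interfaces):

    #include <stdint.h>

    /* Captured once, while boot services are still available. */
    static uint64_t boot_max_store, boot_remain_store, boot_max_var;
    static int      boot_query_failed;

    void snapshot_at_boot(int failed, uint64_t max_store, uint64_t remain,
                          uint64_t max_var)
    {
        boot_query_failed = failed;
        boot_max_store    = failed ? 0 : max_store;
        boot_remain_store = failed ? 0 : remain;
        boot_max_var      = failed ? 0 : max_var;
    }

    /* Served at runtime without another call into the firmware. */
    int query_snapshot(uint64_t *max_store, uint64_t *remain, uint64_t *max_var)
    {
        if (boot_query_failed)
            return -1;               /* relay the boot-time failure */
        *max_store = boot_max_store;
        *remain    = boot_remain_store;
        *max_var   = boot_max_var;
        return 0;
    }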
--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -1128,6 +1128,23 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
if (efi.smbios != EFI_INVALID_TABLE_ADDR)
dmi_efi_get_table((void *)(long)efi.smbios);
+ /* Get snapshot of variable store parameters. */
+ status = efi_rs->QueryVariableInfo(EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ &efi_boot_max_var_store_size,
+ &efi_boot_remain_var_store_size,
+ &efi_boot_max_var_size);
+ if ( EFI_ERROR(status) )
+ {
+ efi_boot_max_var_store_size = 0;
+ efi_boot_remain_var_store_size = 0;
+ efi_boot_max_var_size = status;
+ PrintStr(L"Warning: Could not query variable store: ");
+ DisplayUint(status, 0);
+ PrintStr(newline);
+ }
+
/* Allocate space for trampoline (in first Mb). */
cfg.addr = 0x100000;
cfg.size = trampoline_end - trampoline_start;
--- a/xen/arch/x86/efi/efi.h
+++ b/xen/arch/x86/efi/efi.h
@@ -22,5 +22,8 @@ extern void *efi_memmap;
extern l4_pgentry_t *efi_l4_pgtable;
+extern UINT64 efi_boot_max_var_store_size, efi_boot_remain_var_store_size,
+ efi_boot_max_var_size;
+
unsigned long efi_rs_enter(void);
void efi_rs_leave(unsigned long);
--- a/xen/arch/x86/efi/runtime.c
+++ b/xen/arch/x86/efi/runtime.c
@@ -28,6 +28,10 @@ UINTN __read_mostly efi_memmap_size;
UINTN __read_mostly efi_mdesc_size;
void *__read_mostly efi_memmap;
+UINT64 __read_mostly efi_boot_max_var_store_size;
+UINT64 __read_mostly efi_boot_remain_var_store_size;
+UINT64 __read_mostly efi_boot_max_var_size;
+
struct efi __read_mostly efi = {
.acpi = EFI_INVALID_TABLE_ADDR,
.acpi20 = EFI_INVALID_TABLE_ADDR,
@@ -446,6 +450,35 @@ int efi_runtime_call(struct xenpf_efi_ru
break;
case XEN_EFI_query_variable_info:
+ if ( op->misc & ~XEN_EFI_VARINFO_BOOT_SNAPSHOT )
+ return -EINVAL;
+
+ if ( op->misc & XEN_EFI_VARINFO_BOOT_SNAPSHOT )
+ {
+ if ( (op->u.query_variable_info.attr
+ & ~EFI_VARIABLE_APPEND_WRITE) !=
+ (EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS) )
+ return -EINVAL;
+
+ op->u.query_variable_info.max_store_size =
+ efi_boot_max_var_store_size;
+ op->u.query_variable_info.remain_store_size =
+ efi_boot_remain_var_store_size;
+ if ( efi_boot_max_var_store_size )
+ {
+ op->u.query_variable_info.max_size = efi_boot_max_var_size;
+ status = EFI_SUCCESS;
+ }
+ else
+ {
+ op->u.query_variable_info.max_size = 0;
+ status = efi_boot_max_var_size;
+ }
+ break;
+ }
+
cr3 = efi_rs_enter();
if ( (efi_rs->Hdr.Revision >> 16) < 2 )
{
@@ -462,6 +495,9 @@ int efi_runtime_call(struct xenpf_efi_ru
case XEN_EFI_query_capsule_capabilities:
case XEN_EFI_update_capsule:
+ if ( op->misc )
+ return -EINVAL;
+
cr3 = efi_rs_enter();
if ( (efi_rs->Hdr.Revision >> 16) < 2 )
{
--- a/xen/include/efi/efiapi.h
+++ b/xen/include/efi/efiapi.h
@@ -213,6 +213,10 @@ VOID
#define EFI_VARIABLE_NON_VOLATILE 0x00000001
#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x00000002
#define EFI_VARIABLE_RUNTIME_ACCESS 0x00000004
+#define EFI_VARIABLE_HARDWARE_ERROR_RECORD 0x00000008
+#define EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS 0x00000010
+#define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS 0x00000020
+#define EFI_VARIABLE_APPEND_WRITE 0x00000040
// Variable size limitation
#define EFI_MAXIMUM_VARIABLE_SIZE 1024
--- a/xen/include/public/platform.h
+++ b/xen/include/public/platform.h
@@ -184,6 +184,7 @@ struct xenpf_efi_runtime_call {
struct xenpf_efi_guid vendor_guid;
} get_next_variable_name;
+#define XEN_EFI_VARINFO_BOOT_SNAPSHOT 0x00000001
struct {
uint32_t attr;
uint64_t max_store_size;

@@ -0,0 +1,23 @@
# Commit a7ac9597a7fc6ca934957eb78b41e26638281953
# Date 2013-04-29 11:27:54 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: fix runtime call status for compat mode Dom0
The top two bits (indicating error/warning classification) need to
remain the top two bits.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
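A worked example makes the bug visible: EFI encodes the error/warning class in the top bits of the 64-bit status, and a 32-bit guest expects it in the top bits of a 32-bit value. Shifting right by 62 parks the class in bits 1:0 instead; shifting the high word down and masking keeps the top two bits on top. A small self-contained check:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t status = 0x8000000000000005ULL;   /* error class, code 5 */

        uint32_t bad  = (uint32_t)((status & 0x3fffffff) | (status >> 62));
        uint32_t good = (uint32_t)((status & 0x3fffffff) |
                                   ((status >> 32) & 0xc0000000));

        printf("old: %#010x  new: %#010x\n", bad, good);
        /* old: 0x00000007 -- the error bit is lost, looks like code 7
           new: 0x80000005 -- error class preserved in bit 31 */
        return 0;
    }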
--- a/xen/arch/x86/efi/runtime.c
+++ b/xen/arch/x86/efi/runtime.c
@@ -513,7 +513,7 @@ int efi_runtime_call(struct xenpf_efi_ru
#ifndef COMPAT
op->status = status;
#else
- op->status = (status & 0x3fffffff) | (status >> 62);
+ op->status = (status & 0x3fffffff) | ((status >> 32) & 0xc0000000);
#endif
return rc;

@@ -1,8 +1,8 @@
Index: xen-4.2.1-testing/tools/python/xen/xend/XendDomainInfo.py
Index: xen-4.2.2-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.2.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.2.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -2984,7 +2984,7 @@ class XendDomainInfo:
--- xen-4.2.2-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.2.2-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -2985,7 +2985,7 @@ class XendDomainInfo:
self.guest_bitsize = self.image.getBitSize()
# Make sure there's enough RAM available for the domain

@@ -1,88 +0,0 @@
Subject: e1000: Discard packets that are too long if !SBP and !LPE
From: Michael Contreras michael@inetric.com Sun Dec 2 20:11:22 2012 -0800
Date: Wed Jan 16 14:12:40 2013 +0000:
Git: b4e9b8169dedc0bcf0d3abe07642f761ac70aeea
The e1000_receive function for the e1000 needs to discard packets longer than
1522 bytes if the SBP and LPE flags are disabled. The linux driver assumes
this behavior and allocates memory based on this assumption.
Signed-off-by: Michael Contreras <michael@inetric.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Subject: e1000: Discard oversized packets based on SBP|LPE
From: Michael Contreras <michael@inetric.com>
Date: Wed, 5 Dec 2012 18:31:30 +0000 (-0500)
e1000: Discard oversized packets based on SBP|LPE
Discard packets longer than 16384 when !SBP to match the hardware behavior.
Signed-off-by: Michael Contreras <michael@inetric.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
[ This is a security vulnerability, CVE-2012-6075 / XSA-41. ]
(cherry picked from commit 4c2cae2a882db4d2a231b27b3b31a5bbec6dacbf)
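Both QEMU trees get the same filtering rule. Pulled out into a standalone predicate it reads (a sketch mirroring the patch's logic): SBP (store bad packets) overrides everything; otherwise LPE (long packet enable) selects which size cap applies.

    #include <stdbool.h>
    #include <stddef.h>

    #define MAXIMUM_ETHERNET_VLAN_SIZE 1522   /* drop limit with LPE=0 */
    #define MAXIMUM_ETHERNET_LPE_SIZE  16384  /* drop limit with LPE=1 */

    /* True when the emulated NIC should drop the frame. */
    static bool e1000_should_drop(size_t size, bool lpe, bool sbp)
    {
        return (size > MAXIMUM_ETHERNET_LPE_SIZE ||
                (size > MAXIMUM_ETHERNET_VLAN_SIZE && !lpe)) && !sbp;
    }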
Index: xen-4.2.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
===================================================================
--- xen-4.2.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
+++ xen-4.2.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
@@ -55,6 +55,11 @@ static int debugflags = DBGBIT(TXERR) |
#define REG_IOADDR 0x0
#define REG_IODATA 0x4
+/* this is the size past which hardware will drop packets when setting LPE=0 */
+#define MAXIMUM_ETHERNET_VLAN_SIZE 1522
+/* this is the size past which hardware will drop packets when setting LPE=1 */
+#define MAXIMUM_ETHERNET_LPE_SIZE 16384
+
/*
* HW models:
* E1000_DEV_ID_82540EM works with Windows and Linux
@@ -628,6 +633,14 @@ e1000_receive(void *opaque, const uint8_
return;
}
+ /* Discard oversized packets if !LPE and !SBP. */
+ if ((size > MAXIMUM_ETHERNET_LPE_SIZE ||
+ (size > MAXIMUM_ETHERNET_VLAN_SIZE
+ && !(s->mac_reg[RCTL] & E1000_RCTL_LPE)))
+ && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) {
+ return;
+ }
+
if (!receive_filter(s, buf, size))
return;
Index: xen-4.2.1-testing/tools/qemu-xen-dir-remote/hw/e1000.c
===================================================================
--- xen-4.2.1-testing.orig/tools/qemu-xen-dir-remote/hw/e1000.c
+++ xen-4.2.1-testing/tools/qemu-xen-dir-remote/hw/e1000.c
@@ -59,6 +59,11 @@ static int debugflags = DBGBIT(TXERR) |
#define PNPMMIO_SIZE 0x20000
#define MIN_BUF_SIZE 60 /* Min. octets in an ethernet frame sans FCS */
+/* this is the size past which hardware will drop packets when setting LPE=0 */
+#define MAXIMUM_ETHERNET_VLAN_SIZE 1522
+/* this is the size past which hardware will drop packets when setting LPE=1 */
+#define MAXIMUM_ETHERNET_LPE_SIZE 16384
+
/*
* HW models:
* E1000_DEV_ID_82540EM works with Windows and Linux
@@ -693,6 +698,14 @@ e1000_receive(VLANClientState *nc, const
size = sizeof(min_buf);
}
+ /* Discard oversized packets if !LPE and !SBP. */
+ if ((size > MAXIMUM_ETHERNET_LPE_SIZE ||
+ (size > MAXIMUM_ETHERNET_VLAN_SIZE
+ && !(s->mac_reg[RCTL] & E1000_RCTL_LPE)))
+ && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) {
+ return size;
+ }
+
if (!receive_filter(s, buf, size))
return size;

@@ -1,32 +0,0 @@
References: CVE-2013-0151 XSA-34 bnc#797285
x86_32: don't allow use of nested HVM
There are (indirect) uses of map_domain_page() in the nested HVM code
that are unsafe when not just using the 1:1 mapping.
This is XSA-34 / CVE-2013-0151.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
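Condensed, the new check simply refuses any attempt to switch the feature on when the hypervisor is 32-bit. An illustrative model of the parameter check (not the hvm.c code itself; a plain errno constant stands in for Xen's rc handling):

    static int check_nestedhvm_param(unsigned long value)
    {
    #ifdef __i386__
        return value ? -22 /* -EINVAL */ : 0;     /* never allowed on x86_32 */
    #else
        return value > 1 ? -22 /* -EINVAL */ : 0; /* boolean range, as before */
    #endif
    }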
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3930,6 +3930,10 @@ long do_hvm_op(unsigned long op, XEN_GUE
rc = -EINVAL;
break;
case HVM_PARAM_NESTEDHVM:
+#ifdef __i386__
+ if ( a.value )
+ rc = -EINVAL;
+#else
if ( a.value > 1 )
rc = -EINVAL;
if ( !is_hvm_domain(d) )
@@ -3944,6 +3948,7 @@ long do_hvm_op(unsigned long op, XEN_GUE
for_each_vcpu(d, v)
if ( rc == 0 )
rc = nestedhvm_vcpu_initialise(v);
+#endif
break;
case HVM_PARAM_BUFIOREQ_EVTCHN:
rc = -EINVAL;

@@ -0,0 +1,262 @@
x86: make vcpu_destroy_pagetables() preemptible
... as it may take significant amounts of time.
The function, being moved to mm.c as the better home for it anyway, and
to avoid having to make a new helper function there non-static, is
given a "preemptible" parameter temporarily (until, in a subsequent
patch, its other caller is also being made capable of dealing with
preemption).
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
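All of the XSA-45 patches share one pattern: a teardown that used to run to completion now does a bounded amount of work, parks the partially released page in old_guest_table, and reports -EAGAIN so the hypercall is restarted as a continuation (-EINTR from the type-handling core is folded into -EAGAIN so the guest only ever sees one restart code). A toy user-space model of that loop, with stand-ins for the hypervisor pieces:

    #include <stdio.h>

    #define MYEAGAIN 11

    struct vcpu_model {
        int pages_left;              /* stands in for old_guest_table state */
    };

    /* One bounded chunk of teardown; a stand-in for
     * put_page_and_type_preemptible() plus the preemption check. */
    static int destroy_pagetables_chunk(struct vcpu_model *v)
    {
        int budget = 4;
        while (v->pages_left && budget--)
            v->pages_left--;
        return v->pages_left ? -MYEAGAIN : 0;
    }

    int main(void)
    {
        struct vcpu_model v = { .pages_left = 10 };
        int rc;

        /* Each -EAGAIN becomes a hypercall continuation, so no single
         * invocation holds the CPU for too long. */
        while ((rc = destroy_pagetables_chunk(&v)) == -MYEAGAIN)
            printf("preempted with %d pages left, continuing\n", v.pages_left);
        return rc;
    }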
Index: xen-4.2.1-testing/xen/arch/x86/domain.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/domain.c
+++ xen-4.2.1-testing/xen/arch/x86/domain.c
@@ -73,8 +73,6 @@ void (*dead_idle) (void) __read_mostly =
static void paravirt_ctxt_switch_from(struct vcpu *v);
static void paravirt_ctxt_switch_to(struct vcpu *v);
-static void vcpu_destroy_pagetables(struct vcpu *v);
-
static void default_idle(void)
{
local_irq_disable();
@@ -1058,7 +1056,7 @@ void arch_vcpu_reset(struct vcpu *v)
if ( !is_hvm_vcpu(v) )
{
destroy_gdt(v);
- vcpu_destroy_pagetables(v);
+ vcpu_destroy_pagetables(v, 0);
}
else
{
@@ -2069,63 +2067,6 @@ static int relinquish_memory(
return ret;
}
-static void vcpu_destroy_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
- unsigned long pfn;
-
-#ifdef __x86_64__
- if ( is_pv_32on64_vcpu(v) )
- {
- pfn = l4e_get_pfn(*(l4_pgentry_t *)
- __va(pagetable_get_paddr(v->arch.guest_table)));
-
- if ( pfn != 0 )
- {
- if ( paging_mode_refcounts(d) )
- put_page(mfn_to_page(pfn));
- else
- put_page_and_type(mfn_to_page(pfn));
- }
-
- l4e_write(
- (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
- l4e_empty());
-
- v->arch.cr3 = 0;
- return;
- }
-#endif
-
- pfn = pagetable_get_pfn(v->arch.guest_table);
- if ( pfn != 0 )
- {
- if ( paging_mode_refcounts(d) )
- put_page(mfn_to_page(pfn));
- else
- put_page_and_type(mfn_to_page(pfn));
- v->arch.guest_table = pagetable_null();
- }
-
-#ifdef __x86_64__
- /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
- pfn = pagetable_get_pfn(v->arch.guest_table_user);
- if ( pfn != 0 )
- {
- if ( !is_pv_32bit_vcpu(v) )
- {
- if ( paging_mode_refcounts(d) )
- put_page(mfn_to_page(pfn));
- else
- put_page_and_type(mfn_to_page(pfn));
- }
- v->arch.guest_table_user = pagetable_null();
- }
-#endif
-
- v->arch.cr3 = 0;
-}
-
int domain_relinquish_resources(struct domain *d)
{
int ret;
@@ -2143,7 +2084,11 @@ int domain_relinquish_resources(struct d
/* Drop the in-use references to page-table bases. */
for_each_vcpu ( d, v )
- vcpu_destroy_pagetables(v);
+ {
+ ret = vcpu_destroy_pagetables(v, 1);
+ if ( ret )
+ return ret;
+ }
if ( !is_hvm_domain(d) )
{
Index: xen-4.2.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/mm.c
@@ -2825,6 +2825,82 @@ static void put_superpage(unsigned long
#endif
+static int put_old_guest_table(struct vcpu *v)
+{
+ int rc;
+
+ if ( !v->arch.old_guest_table )
+ return 0;
+
+ switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table, 1) )
+ {
+ case -EINTR:
+ case -EAGAIN:
+ return -EAGAIN;
+ }
+
+ v->arch.old_guest_table = NULL;
+
+ return rc;
+}
+
+int vcpu_destroy_pagetables(struct vcpu *v, bool_t preemptible)
+{
+ unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
+ struct page_info *page;
+ int rc = put_old_guest_table(v);
+
+ if ( rc )
+ return rc;
+
+#ifdef __x86_64__
+ if ( is_pv_32on64_vcpu(v) )
+ mfn = l4e_get_pfn(*(l4_pgentry_t *)mfn_to_virt(mfn));
+#endif
+
+ if ( mfn )
+ {
+ page = mfn_to_page(mfn);
+ if ( paging_mode_refcounts(v->domain) )
+ put_page(page);
+ else
+ rc = put_page_and_type_preemptible(page, preemptible);
+ }
+
+#ifdef __x86_64__
+ if ( is_pv_32on64_vcpu(v) )
+ {
+ if ( !rc )
+ l4e_write(
+ (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
+ l4e_empty());
+ }
+ else
+#endif
+ if ( !rc )
+ {
+ v->arch.guest_table = pagetable_null();
+
+#ifdef __x86_64__
+ /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+ mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ if ( mfn )
+ {
+ page = mfn_to_page(mfn);
+ if ( paging_mode_refcounts(v->domain) )
+ put_page(page);
+ else
+ rc = put_page_and_type_preemptible(page, preemptible);
+ }
+ if ( !rc )
+ v->arch.guest_table_user = pagetable_null();
+#endif
+ }
+
+ v->arch.cr3 = 0;
+
+ return rc;
+}
int new_guest_cr3(unsigned long mfn)
{
@@ -3011,12 +3087,21 @@ long do_mmuext_op(
unsigned int foreigndom)
{
struct mmuext_op op;
- int rc = 0, i = 0, okay;
unsigned long type;
- unsigned int done = 0;
+ unsigned int i = 0, done = 0;
struct vcpu *curr = current;
struct domain *d = curr->domain;
struct domain *pg_owner;
+ int okay, rc = put_old_guest_table(curr);
+
+ if ( unlikely(rc) )
+ {
+ if ( likely(rc == -EAGAIN) )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
+ foreigndom);
+ return rc;
+ }
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
Index: xen-4.2.1-testing/xen/arch/x86/x86_64/compat/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/x86_64/compat/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/x86_64/compat/mm.c
@@ -365,7 +365,7 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
: mcs->call.args[1];
unsigned int left = arg1 & ~MMU_UPDATE_PREEMPTED;
- BUG_ON(left == arg1);
+ BUG_ON(left == arg1 && left != i);
BUG_ON(left > count);
guest_handle_add_offset(nat_ops, i - left);
guest_handle_subtract_offset(cmp_uops, left);
Index: xen-4.2.1-testing/xen/include/asm-x86/domain.h
===================================================================
--- xen-4.2.1-testing.orig/xen/include/asm-x86/domain.h
+++ xen-4.2.1-testing/xen/include/asm-x86/domain.h
@@ -464,6 +464,7 @@ struct arch_vcpu
pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
#endif
pagetable_t guest_table; /* (MFN) guest notion of cr3 */
+ struct page_info *old_guest_table; /* partially destructed pagetable */
/* guest_table holds a ref to the page, and also a type-count unless
* shadow refcounts are in use */
pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
Index: xen-4.2.1-testing/xen/include/asm-x86/mm.h
===================================================================
--- xen-4.2.1-testing.orig/xen/include/asm-x86/mm.h
+++ xen-4.2.1-testing/xen/include/asm-x86/mm.h
@@ -605,6 +605,7 @@ void audit_domains(void);
int new_guest_cr3(unsigned long pfn);
void make_cr3(struct vcpu *v, unsigned long mfn);
void update_cr3(struct vcpu *v);
+int vcpu_destroy_pagetables(struct vcpu *, bool_t preemptible);
void propagate_page_fault(unsigned long addr, u16 error_code);
void *do_page_walk(struct vcpu *v, unsigned long addr);

@@ -0,0 +1,173 @@
x86: make new_guest_cr3() preemptible
... as it may take significant amounts of time.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
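One subtlety the patch comments on explicitly: once the operation can be preempted in the put-old-MFN phase, the restarted invocation will find the new base already installed, so it must recognize that case and succeed without taking another reference. The idempotency check, reduced to a sketch (simplified names, not mm.c itself):

    /* Restart-safe base pointer switch. */
    static int set_base_mfn(unsigned long *cur_mfn, unsigned long new_mfn)
    {
        if (*cur_mfn == new_mfn)
            return 0;      /* continuation after preemption: already done */

        /* ... validate and reference new_mfn, install it, then drop the
         * old reference preemptibly (possibly returning -EAGAIN) ... */
        *cur_mfn = new_mfn;
        return 0;
    }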
Index: xen-4.2.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/mm.c
@@ -2906,44 +2906,69 @@ int new_guest_cr3(unsigned long mfn)
{
struct vcpu *curr = current;
struct domain *d = curr->domain;
- int okay;
+ int rc;
unsigned long old_base_mfn;
#ifdef __x86_64__
if ( is_pv_32on64_domain(d) )
{
- okay = paging_mode_refcounts(d)
- ? 0 /* Old code was broken, but what should it be? */
- : mod_l4_entry(
+ rc = paging_mode_refcounts(d)
+ ? -EINVAL /* Old code was broken, but what should it be? */
+ : mod_l4_entry(
__va(pagetable_get_paddr(curr->arch.guest_table)),
l4e_from_pfn(
mfn,
(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
- pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
- if ( unlikely(!okay) )
+ pagetable_get_pfn(curr->arch.guest_table), 0, 1, curr);
+ switch ( rc )
{
+ case 0:
+ break;
+ case -EINTR:
+ case -EAGAIN:
+ return -EAGAIN;
+ default:
MEM_LOG("Error while installing new compat baseptr %lx", mfn);
- return 0;
+ return rc;
}
invalidate_shadow_ldt(curr, 0);
write_ptbase(curr);
- return 1;
+ return 0;
}
#endif
- okay = paging_mode_refcounts(d)
- ? get_page_from_pagenr(mfn, d)
- : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
- if ( unlikely(!okay) )
+ rc = put_old_guest_table(curr);
+ if ( unlikely(rc) )
+ return rc;
+
+ old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
+ /*
+ * This is particularly important when getting restarted after the
+ * previous attempt got preempted in the put-old-MFN phase.
+ */
+ if ( old_base_mfn == mfn )
{
- MEM_LOG("Error while installing new baseptr %lx", mfn);
+ write_ptbase(curr);
return 0;
}
- invalidate_shadow_ldt(curr, 0);
+ rc = paging_mode_refcounts(d)
+ ? (get_page_from_pagenr(mfn, d) ? 0 : -EINVAL)
+ : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1);
+ switch ( rc )
+ {
+ case 0:
+ break;
+ case -EINTR:
+ case -EAGAIN:
+ return -EAGAIN;
+ default:
+ MEM_LOG("Error while installing new baseptr %lx", mfn);
+ return rc;
+ }
- old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
+ invalidate_shadow_ldt(curr, 0);
curr->arch.guest_table = pagetable_from_pfn(mfn);
update_cr3(curr);
@@ -2952,13 +2977,25 @@ int new_guest_cr3(unsigned long mfn)
if ( likely(old_base_mfn != 0) )
{
+ struct page_info *page = mfn_to_page(old_base_mfn);
+
if ( paging_mode_refcounts(d) )
- put_page(mfn_to_page(old_base_mfn));
+ put_page(page);
else
- put_page_and_type(mfn_to_page(old_base_mfn));
+ switch ( rc = put_page_and_type_preemptible(page, 1) )
+ {
+ case -EINTR:
+ rc = -EAGAIN;
+ case -EAGAIN:
+ curr->arch.old_guest_table = page;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
}
- return 1;
+ return rc;
}
static struct domain *get_pg_owner(domid_t domid)
@@ -3256,8 +3293,13 @@ long do_mmuext_op(
}
case MMUEXT_NEW_BASEPTR:
- okay = (!paging_mode_translate(d)
- && new_guest_cr3(op.arg1.mfn));
+ if ( paging_mode_translate(d) )
+ okay = 0;
+ else
+ {
+ rc = new_guest_cr3(op.arg1.mfn);
+ okay = !rc;
+ }
break;
Index: xen-4.2.1-testing/xen/arch/x86/traps.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/traps.c
+++ xen-4.2.1-testing/xen/arch/x86/traps.c
@@ -2407,12 +2407,23 @@ static int emulate_privileged_op(struct
#endif
}
page = get_page_from_gfn(v->domain, gfn, NULL, P2M_ALLOC);
- rc = page ? new_guest_cr3(page_to_mfn(page)) : 0;
if ( page )
+ {
+ rc = new_guest_cr3(page_to_mfn(page));
put_page(page);
+ }
+ else
+ rc = -EINVAL;
domain_unlock(v->domain);
- if ( rc == 0 ) /* not okay */
+ switch ( rc )
+ {
+ case 0:
+ break;
+ case -EAGAIN: /* retry after preemption */
+ goto skip;
+ default: /* not okay */
goto fail;
+ }
break;
}

@@ -0,0 +1,76 @@
x86: make MMUEXT_NEW_USER_BASEPTR preemptible
... as it may take significant amounts of time.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
Index: xen-4.2.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/mm.c
@@ -3313,29 +3313,56 @@ long do_mmuext_op(
break;
}
+ old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+ /*
+ * This is particularly important when getting restarted after the
+ * previous attempt got preempted in the put-old-MFN phase.
+ */
+ if ( old_mfn == op.arg1.mfn )
+ break;
+
if ( op.arg1.mfn != 0 )
{
if ( paging_mode_refcounts(d) )
okay = get_page_from_pagenr(op.arg1.mfn, d);
else
- okay = !get_page_and_type_from_pagenr(
- op.arg1.mfn, PGT_root_page_table, d, 0, 0);
+ {
+ rc = get_page_and_type_from_pagenr(
+ op.arg1.mfn, PGT_root_page_table, d, 0, 1);
+ okay = !rc;
+ }
if ( unlikely(!okay) )
{
- MEM_LOG("Error while installing new mfn %lx", op.arg1.mfn);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
+ else if ( rc != -EAGAIN )
+ MEM_LOG("Error while installing new mfn %lx",
+ op.arg1.mfn);
break;
}
}
- old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
if ( old_mfn != 0 )
{
+ struct page_info *page = mfn_to_page(old_mfn);
+
if ( paging_mode_refcounts(d) )
- put_page(mfn_to_page(old_mfn));
+ put_page(page);
else
- put_page_and_type(mfn_to_page(old_mfn));
+ switch ( rc = put_page_and_type_preemptible(page, 1) )
+ {
+ case -EINTR:
+ rc = -EAGAIN;
+ case -EAGAIN:
+ curr->arch.old_guest_table = page;
+ okay = 0;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
}
break;

@@ -0,0 +1,218 @@
x86: make vcpu_reset() preemptible
... as dropping the old page tables may take significant amounts of
time.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
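vcpu_reset() thus becomes restartable too: the new _VPF_in_reset flag keeps the vCPU parked across continuations, and only callers that can create a continuation (the domctl path) may see -EAGAIN, while callers that cannot (the vlapic INIT path, HVM S3) assert that it never occurs. The two calling conventions, as a compilable sketch with hypothetical names:

    #include <assert.h>

    #define MYEAGAIN 11

    struct vcpu { int has_pv_pagetables; };

    static int vcpu_reset_model(struct vcpu *v)
    {
        /* Dropping PV page tables may need several passes; HVM has none. */
        return v->has_pv_pagetables ? -MYEAGAIN : 0;
    }

    /* domctl path: -EAGAIN is legitimate and becomes a hypercall
     * continuation (modelled here as simply finishing the work). */
    static int domctl_reset(struct vcpu *v)
    {
        int ret;
        while ((ret = vcpu_reset_model(v)) == -MYEAGAIN)
            v->has_pv_pagetables = 0;
        return ret;
    }

    /* vlapic INIT / S3 path: only ever used on HVM vCPUs, so a restart
     * must be impossible -- hence the new ASSERTs in the patch. */
    static void hvm_reset(struct vcpu *v)
    {
        int rc = vcpu_reset_model(v);
        assert(rc == 0);
        (void)rc;
    }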
Index: xen-4.2.2-testing/xen/arch/x86/domain.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/domain.c
+++ xen-4.2.2-testing/xen/arch/x86/domain.c
@@ -1051,17 +1051,16 @@ int arch_set_info_guest(
#undef c
}
-void arch_vcpu_reset(struct vcpu *v)
+int arch_vcpu_reset(struct vcpu *v)
{
if ( !is_hvm_vcpu(v) )
{
destroy_gdt(v);
- vcpu_destroy_pagetables(v, 0);
- }
- else
- {
- vcpu_end_shutdown_deferral(v);
+ return vcpu_destroy_pagetables(v);
}
+
+ vcpu_end_shutdown_deferral(v);
+ return 0;
}
/*
@@ -2085,7 +2084,7 @@ int domain_relinquish_resources(struct d
/* Drop the in-use references to page-table bases. */
for_each_vcpu ( d, v )
{
- ret = vcpu_destroy_pagetables(v, 1);
+ ret = vcpu_destroy_pagetables(v);
if ( ret )
return ret;
}
Index: xen-4.2.2-testing/xen/arch/x86/hvm/hvm.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/hvm.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/hvm.c
@@ -3577,8 +3577,11 @@ static void hvm_s3_suspend(struct domain
for_each_vcpu ( d, v )
{
+ int rc;
+
vlapic_reset(vcpu_vlapic(v));
- vcpu_reset(v);
+ rc = vcpu_reset(v);
+ ASSERT(!rc);
}
vpic_reset(d);
Index: xen-4.2.2-testing/xen/arch/x86/hvm/vlapic.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/hvm/vlapic.c
+++ xen-4.2.2-testing/xen/arch/x86/hvm/vlapic.c
@@ -255,10 +255,13 @@ static void vlapic_init_sipi_action(unsi
{
case APIC_DM_INIT: {
bool_t fpu_initialised;
+ int rc;
+
domain_lock(target->domain);
/* Reset necessary VCPU state. This does not include FPU state. */
fpu_initialised = target->fpu_initialised;
- vcpu_reset(target);
+ rc = vcpu_reset(target);
+ ASSERT(!rc);
target->fpu_initialised = fpu_initialised;
vlapic_reset(vcpu_vlapic(target));
domain_unlock(target->domain);
Index: xen-4.2.2-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.2-testing/xen/arch/x86/mm.c
@@ -2844,7 +2844,7 @@ static int put_old_guest_table(struct vc
return rc;
}
-int vcpu_destroy_pagetables(struct vcpu *v, bool_t preemptible)
+int vcpu_destroy_pagetables(struct vcpu *v)
{
unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
struct page_info *page;
@@ -2864,7 +2864,7 @@ int vcpu_destroy_pagetables(struct vcpu
if ( paging_mode_refcounts(v->domain) )
put_page(page);
else
- rc = put_page_and_type_preemptible(page, preemptible);
+ rc = put_page_and_type_preemptible(page, 1);
}
#ifdef __x86_64__
@@ -2890,7 +2890,7 @@ int vcpu_destroy_pagetables(struct vcpu
if ( paging_mode_refcounts(v->domain) )
put_page(page);
else
- rc = put_page_and_type_preemptible(page, preemptible);
+ rc = put_page_and_type_preemptible(page, 1);
}
if ( !rc )
v->arch.guest_table_user = pagetable_null();
Index: xen-4.2.2-testing/xen/common/domain.c
===================================================================
--- xen-4.2.2-testing.orig/xen/common/domain.c
+++ xen-4.2.2-testing/xen/common/domain.c
@@ -779,14 +779,18 @@ void domain_unpause_by_systemcontroller(
domain_unpause(d);
}
-void vcpu_reset(struct vcpu *v)
+int vcpu_reset(struct vcpu *v)
{
struct domain *d = v->domain;
+ int rc;
vcpu_pause(v);
domain_lock(d);
- arch_vcpu_reset(v);
+ set_bit(_VPF_in_reset, &v->pause_flags);
+ rc = arch_vcpu_reset(v);
+ if ( rc )
+ goto out_unlock;
set_bit(_VPF_down, &v->pause_flags);
@@ -802,9 +806,13 @@ void vcpu_reset(struct vcpu *v)
#endif
cpumask_clear(v->cpu_affinity_tmp);
clear_bit(_VPF_blocked, &v->pause_flags);
+ clear_bit(_VPF_in_reset, &v->pause_flags);
+ out_unlock:
domain_unlock(v->domain);
vcpu_unpause(v);
+
+ return rc;
}
Index: xen-4.2.2-testing/xen/common/domctl.c
===================================================================
--- xen-4.2.2-testing.orig/xen/common/domctl.c
+++ xen-4.2.2-testing/xen/common/domctl.c
@@ -307,8 +307,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
if ( guest_handle_is_null(op->u.vcpucontext.ctxt) )
{
- vcpu_reset(v);
- ret = 0;
+ ret = vcpu_reset(v);
+ if ( ret == -EAGAIN )
+ ret = hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
goto svc_out;
}
Index: xen-4.2.2-testing/xen/include/asm-x86/mm.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/asm-x86/mm.h
+++ xen-4.2.2-testing/xen/include/asm-x86/mm.h
@@ -605,7 +605,7 @@ void audit_domains(void);
int new_guest_cr3(unsigned long pfn);
void make_cr3(struct vcpu *v, unsigned long mfn);
void update_cr3(struct vcpu *v);
-int vcpu_destroy_pagetables(struct vcpu *, bool_t preemptible);
+int vcpu_destroy_pagetables(struct vcpu *);
void propagate_page_fault(unsigned long addr, u16 error_code);
void *do_page_walk(struct vcpu *v, unsigned long addr);
Index: xen-4.2.2-testing/xen/include/xen/domain.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/domain.h
+++ xen-4.2.2-testing/xen/include/xen/domain.h
@@ -13,7 +13,7 @@ typedef union {
struct vcpu *alloc_vcpu(
struct domain *d, unsigned int vcpu_id, unsigned int cpu_id);
struct vcpu *alloc_dom0_vcpu0(void);
-void vcpu_reset(struct vcpu *v);
+int vcpu_reset(struct vcpu *);
struct xen_domctl_getdomaininfo;
void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
@@ -67,7 +67,7 @@ void arch_dump_vcpu_info(struct vcpu *v)
void arch_dump_domain_info(struct domain *d);
-void arch_vcpu_reset(struct vcpu *v);
+int arch_vcpu_reset(struct vcpu *);
extern spinlock_t vcpu_alloc_lock;
bool_t domctl_lock_acquire(void);
Index: xen-4.2.2-testing/xen/include/xen/sched.h
===================================================================
--- xen-4.2.2-testing.orig/xen/include/xen/sched.h
+++ xen-4.2.2-testing/xen/include/xen/sched.h
@@ -644,6 +644,9 @@ static inline struct domain *next_domain
/* VCPU is blocked due to missing mem_sharing ring. */
#define _VPF_mem_sharing 6
#define VPF_mem_sharing (1UL<<_VPF_mem_sharing)
+ /* VCPU is being reset. */
+#define _VPF_in_reset 7
+#define VPF_in_reset (1UL<<_VPF_in_reset)
static inline int vcpu_runnable(struct vcpu *v)
{

@@ -0,0 +1,212 @@
x86: make arch_set_info_guest() preemptible
.. as the root page table validation (and the dropping of an eventual
old one) can require meaningful amounts of time.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
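A wrinkle worth noting in this diff: if the context being loaded names the very page that a preempted teardown parked in old_guest_table, the parked reference is taken back instead of a fresh one being acquired. Reduced to its essence (illustrative only):

    #include <stddef.h>

    struct page { int refcount; };

    static struct page *old_guest_table;   /* parked by a preempted teardown */

    /* The restarted hypercall looked up cr3_page and took one extra
     * reference in doing so. */
    static void install_guest_table(struct page *cr3_page)
    {
        if (cr3_page == old_guest_table) {
            old_guest_table = NULL;  /* adopt the parked ref and type count */
            cr3_page->refcount--;    /* drop the lookup's extra reference */
        } else {
            /* full, preemptible type validation of cr3_page goes here */
        }
    }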
Index: xen-4.2.2-testing/xen/arch/x86/domain.c
===================================================================
--- xen-4.2.2-testing.orig/xen/arch/x86/domain.c
+++ xen-4.2.2-testing/xen/arch/x86/domain.c
@@ -858,6 +858,9 @@ int arch_set_info_guest(
if ( !v->is_initialised )
{
+ if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
+ return -EINVAL;
+
v->arch.pv_vcpu.ldt_base = c(ldt_base);
v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
}
@@ -955,24 +958,44 @@ int arch_set_info_guest(
if ( rc != 0 )
return rc;
+ set_bit(_VPF_in_reset, &v->pause_flags);
+
if ( !compat )
- {
cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
- cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
-
- if ( !cr3_page )
- {
- destroy_gdt(v);
- return -EINVAL;
- }
- if ( !paging_mode_refcounts(d)
- && !get_page_type(cr3_page, PGT_base_page_table) )
- {
- put_page(cr3_page);
- destroy_gdt(v);
- return -EINVAL;
- }
+#ifdef CONFIG_COMPAT
+ else
+ cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
+#endif
+ cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
+ if ( !cr3_page )
+ rc = -EINVAL;
+ else if ( paging_mode_refcounts(d) )
+ /* nothing */;
+ else if ( cr3_page == v->arch.old_guest_table )
+ {
+ v->arch.old_guest_table = NULL;
+ put_page(cr3_page);
+ }
+ else
+ {
+ /*
+ * Since v->arch.guest_table{,_user} are both NULL, this effectively
+ * is just a call to put_old_guest_table().
+ */
+ if ( !compat )
+ rc = vcpu_destroy_pagetables(v);
+ if ( !rc )
+ rc = get_page_type_preemptible(cr3_page,
+ !compat ? PGT_root_page_table
+ : PGT_l3_page_table);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
+ }
+ if ( rc )
+ /* handled below */;
+ else if ( !compat )
+ {
v->arch.guest_table = pagetable_from_page(cr3_page);
#ifdef __x86_64__
if ( c.nat->ctrlreg[1] )
@@ -980,56 +1003,44 @@ int arch_set_info_guest(
cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
- if ( !cr3_page ||
- (!paging_mode_refcounts(d)
- && !get_page_type(cr3_page, PGT_base_page_table)) )
+ if ( !cr3_page )
+ rc = -EINVAL;
+ else if ( !paging_mode_refcounts(d) )
{
- if (cr3_page)
- put_page(cr3_page);
- cr3_page = pagetable_get_page(v->arch.guest_table);
- v->arch.guest_table = pagetable_null();
- if ( paging_mode_refcounts(d) )
- put_page(cr3_page);
- else
- put_page_and_type(cr3_page);
- destroy_gdt(v);
- return -EINVAL;
+ rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
+ switch ( rc )
+ {
+ case -EINTR:
+ rc = -EAGAIN;
+ case -EAGAIN:
+ v->arch.old_guest_table =
+ pagetable_get_page(v->arch.guest_table);
+ v->arch.guest_table = pagetable_null();
+ break;
+ }
}
-
- v->arch.guest_table_user = pagetable_from_page(cr3_page);
- }
- else if ( !(flags & VGCF_in_kernel) )
- {
- destroy_gdt(v);
- return -EINVAL;
+ if ( !rc )
+ v->arch.guest_table_user = pagetable_from_page(cr3_page);
}
}
else
{
l4_pgentry_t *l4tab;
- cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
- cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
-
- if ( !cr3_page)
- {
- destroy_gdt(v);
- return -EINVAL;
- }
-
- if (!paging_mode_refcounts(d)
- && !get_page_type(cr3_page, PGT_l3_page_table) )
- {
- put_page(cr3_page);
- destroy_gdt(v);
- return -EINVAL;
- }
-
l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
*l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
#endif
}
+ if ( rc )
+ {
+ if ( cr3_page )
+ put_page(cr3_page);
+ destroy_gdt(v);
+ return rc;
+ }
+
+ clear_bit(_VPF_in_reset, &v->pause_flags);
if ( v->vcpu_id == 0 )
update_domain_wallclock_time(d);
Index: xen-4.2.2-testing/xen/common/compat/domain.c
===================================================================
--- xen-4.2.2-testing.orig/xen/common/compat/domain.c
+++ xen-4.2.2-testing/xen/common/compat/domain.c
@@ -50,6 +50,10 @@ int compat_vcpu_op(int cmd, int vcpuid,
rc = v->is_initialised ? -EEXIST : arch_set_info_guest(v, cmp_ctxt);
domain_unlock(d);
+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+ cmd, vcpuid, arg);
+
xfree(cmp_ctxt);
break;
}
Index: xen-4.2.2-testing/xen/common/domain.c
===================================================================
--- xen-4.2.2-testing.orig/xen/common/domain.c
+++ xen-4.2.2-testing/xen/common/domain.c
@@ -849,6 +849,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN
domain_unlock(d);
free_vcpu_guest_context(ctxt);
+
+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+ cmd, vcpuid, arg);
+
break;
case VCPUOP_up: {
Index: xen-4.2.2-testing/xen/common/domctl.c
===================================================================
--- xen-4.2.2-testing.orig/xen/common/domctl.c
+++ xen-4.2.2-testing/xen/common/domctl.c
@@ -339,6 +339,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
domain_pause(d);
ret = arch_set_info_guest(v, c);
domain_unpause(d);
+
+ if ( ret == -EAGAIN )
+ ret = hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
}
svc_out:

@@ -0,0 +1,131 @@
x86: make page table unpinning preemptible
... as it may take significant amounts of time.
Since we can't re-invoke the operation in a second attempt, the
continuation logic must be slightly tweaked so that we make sure
do_mmuext_op() gets run one more time even when the preempted unpin
operation was the last one in a batch.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
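Because an unpin cannot be re-invoked (the pin bit is already clear), the continuation cannot simply replay the op. The trick: once only the deferred put remains, a continuation is queued with a null ops handle and count == MMU_UPDATE_PREEMPTED, and the eventual return value is smuggled through the now-unused foreigndom argument. As a state sketch (hypothetical names):

    #include <stddef.h>

    #define PREEMPTED_SENTINEL (1u << 31) /* stands in for MMU_UPDATE_PREEMPTED */

    struct call { const void *ops; unsigned int count; int foreigndom; };

    /* Entry check mirroring the patch: a null handle plus the bare
     * sentinel count means "no ops left -- finish the deferred put and
     * return the rc stashed in foreigndom". */
    static int mmuext_op_model(struct call *c, int *old_guest_table)
    {
        if (c->count == PREEMPTED_SENTINEL && c->ops == NULL) {
            *old_guest_table = 0;    /* complete the parked page drop */
            return c->foreigndom;    /* the smuggled final return value */
        }
        /* ... normal op processing ... */
        return 0;
    }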
Index: xen-4.2.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/mm.c
@@ -3140,6 +3140,14 @@ long do_mmuext_op(
return rc;
}
+ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+ likely(guest_handle_is_null(uops)) )
+ {
+ /* See the curr->arch.old_guest_table related
+ * hypercall_create_continuation() below. */
+ return (int)foreigndom;
+ }
+
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
count &= ~MMU_UPDATE_PREEMPTED;
@@ -3163,7 +3171,7 @@ long do_mmuext_op(
for ( i = 0; i < count; i++ )
{
- if ( hypercall_preempt_check() )
+ if ( curr->arch.old_guest_table || hypercall_preempt_check() )
{
rc = -EAGAIN;
break;
@@ -3283,7 +3291,17 @@ long do_mmuext_op(
break;
}
- put_page_and_type(page);
+ switch ( rc = put_page_and_type_preemptible(page, 1) )
+ {
+ case -EINTR:
+ case -EAGAIN:
+ curr->arch.old_guest_table = page;
+ rc = 0;
+ break;
+ default:
+ BUG_ON(rc);
+ break;
+ }
put_page(page);
/* A page is dirtied when its pin status is cleared. */
@@ -3604,9 +3622,27 @@ long do_mmuext_op(
}
if ( rc == -EAGAIN )
+ {
+ ASSERT(i < count);
rc = hypercall_create_continuation(
__HYPERVISOR_mmuext_op, "hihi",
uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE(void) null;
+
+ ASSERT(rc || i == count);
+ set_xen_guest_handle(null, NULL);
+ /*
+ * In order to have a way to communicate the final return value to
+ * our continuation, we pass this in place of "foreigndom", building
+ * on the fact that this argument isn't needed anymore.
+ */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi", null,
+ MMU_UPDATE_PREEMPTED, null, rc);
+ }
put_pg_owner(pg_owner);
Index: xen-4.2.1-testing/xen/arch/x86/x86_64/compat/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/x86_64/compat/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/x86_64/compat/mm.c
@@ -268,6 +268,13 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
int rc = 0;
XEN_GUEST_HANDLE(mmuext_op_t) nat_ops;
+ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+ likely(guest_handle_is_null(cmp_uops)) )
+ {
+ set_xen_guest_handle(nat_ops, NULL);
+ return do_mmuext_op(nat_ops, count, pdone, foreigndom);
+ }
+
preempt_mask = count & MMU_UPDATE_PREEMPTED;
count ^= preempt_mask;
@@ -370,12 +377,18 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
guest_handle_add_offset(nat_ops, i - left);
guest_handle_subtract_offset(cmp_uops, left);
left = 1;
- BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops));
- BUG_ON(left != arg1);
- if (!test_bit(_MCSF_in_multicall, &mcs->flags))
- regs->_ecx += count - i;
+ if ( arg1 != MMU_UPDATE_PREEMPTED )
+ {
+ BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops,
+ cmp_uops));
+ if ( !test_bit(_MCSF_in_multicall, &mcs->flags) )
+ regs->_ecx += count - i;
+ else
+ mcs->compat_call.args[1] += count - i;
+ }
else
- mcs->compat_call.args[1] += count - i;
+ BUG_ON(hypercall_xlat_continuation(&left, 0));
+ BUG_ON(left != arg1);
}
else
BUG_ON(err > 0);

@@ -0,0 +1,257 @@
x86: make page table handling error paths preemptible
... as they may take significant amounts of time.
This requires cloning the tweaked continuation logic from
do_mmuext_op() to do_mmu_update().
Note that in mod_l[34]_entry() a negative "preemptible" value gets
passed to put_page_from_l[34]e() now, telling the callee to store the
respective page in current->arch.old_guest_table (for a hypercall
continuation to pick up), rather than carrying out the put right away.
This is going to be made a little more explicit by a subsequent cleanup
patch.
This is part of CVE-2013-1918 / XSA-45.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
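The stopgap convention to keep in mind while reading this diff: mod_l[34]_entry() now passes -preemptible down, and a negative value tells put_page_from_l[34]e() to defer the put by parking the page in old_guest_table rather than dropping it inline. In isolation the convention looks like this (a sketch; the follow-up cleanup patch replaces the sign trick with an explicit defer flag):

    struct page { int type_refs; };

    static struct page *old_guest_table;

    /* Tri-state "preemptible" argument:
     *   > 0  drop the reference now, allowing preemption
     *   == 0 drop it now, non-preemptibly
     *   < 0  defer: park the page for the continuation to finish */
    static int put_ref(struct page *pg, int preemptible)
    {
        if (preemptible < 0) {
            old_guest_table = pg;
            return 0;
        }
        pg->type_refs--;        /* stand-in for put_page_and_type[...] */
        return 0;
    }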
Index: xen-4.2.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.2.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.2.1-testing/xen/arch/x86/mm.c
@@ -1258,7 +1258,16 @@ static int put_page_from_l3e(l3_pgentry_
#endif
if ( unlikely(partial > 0) )
+ {
+ ASSERT(preemptible >= 0);
return __put_page_type(l3e_get_page(l3e), preemptible);
+ }
+
+ if ( preemptible < 0 )
+ {
+ current->arch.old_guest_table = l3e_get_page(l3e);
+ return 0;
+ }
return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
}
@@ -1271,7 +1280,17 @@ static int put_page_from_l4e(l4_pgentry_
(l4e_get_pfn(l4e) != pfn) )
{
if ( unlikely(partial > 0) )
+ {
+ ASSERT(preemptible >= 0);
return __put_page_type(l4e_get_page(l4e), preemptible);
+ }
+
+ if ( preemptible < 0 )
+ {
+ current->arch.old_guest_table = l4e_get_page(l4e);
+ return 0;
+ }
+
return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
}
return 1;
@@ -1566,12 +1585,17 @@ static int alloc_l3_table(struct page_in
if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
{
MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ current->arch.old_guest_table = page;
+ }
while ( i-- > 0 )
{
if ( !is_guest_l3_slot(i) )
continue;
unadjust_guest_l3e(pl3e[i], d);
- put_page_from_l3e(pl3e[i], pfn, 0, 0);
}
}
@@ -1601,22 +1625,24 @@ static int alloc_l4_table(struct page_in
page->nr_validated_ptes = i;
page->partial_pte = partial ?: 1;
}
- else if ( rc == -EINTR )
+ else if ( rc < 0 )
{
+ if ( rc != -EINTR )
+ MEM_LOG("Failure in alloc_l4_table: entry %d", i);
if ( i )
{
page->nr_validated_ptes = i;
page->partial_pte = 0;
- rc = -EAGAIN;
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
+ else
+ {
+ if ( current->arch.old_guest_table )
+ page->nr_validated_ptes++;
+ current->arch.old_guest_table = page;
+ }
}
}
- else if ( rc < 0 )
- {
- MEM_LOG("Failure in alloc_l4_table: entry %d", i);
- while ( i-- > 0 )
- if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn, 0, 0);
- }
if ( rc < 0 )
return rc;
@@ -2064,7 +2090,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
}
- put_page_from_l3e(ol3e, pfn, 0, 0);
+ put_page_from_l3e(ol3e, pfn, 0, -preemptible);
return rc;
}
@@ -2127,7 +2153,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
return -EFAULT;
}
- put_page_from_l4e(ol4e, pfn, 0, 0);
+ put_page_from_l4e(ol4e, pfn, 0, -preemptible);
return rc;
}
@@ -2285,7 +2311,15 @@ static int alloc_page_type(struct page_i
PRtype_info ": caf=%08lx taf=%" PRtype_info,
page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
type, page->count_info, page->u.inuse.type_info);
- page->u.inuse.type_info = 0;
+ if ( page != current->arch.old_guest_table )
+ page->u.inuse.type_info = 0;
+ else
+ {
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask | PGT_validated)) == 1);
+ get_page_light(page);
+ page->u.inuse.type_info |= PGT_partial;
+ }
}
else
{
@@ -3235,21 +3269,17 @@ long do_mmuext_op(
}
if ( (rc = xsm_memory_pin_page(d, pg_owner, page)) != 0 )
- {
- put_page_and_type(page);
okay = 0;
- break;
- }
-
- if ( unlikely(test_and_set_bit(_PGT_pinned,
- &page->u.inuse.type_info)) )
+ else if ( unlikely(test_and_set_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
{
MEM_LOG("Mfn %lx already pinned", page_to_mfn(page));
- put_page_and_type(page);
okay = 0;
- break;
}
+ if ( unlikely(!okay) )
+ goto pin_drop;
+
/* A page is dirtied when its pin status is set. */
paging_mark_dirty(pg_owner, page_to_mfn(page));
@@ -3263,7 +3293,13 @@ long do_mmuext_op(
&page->u.inuse.type_info));
spin_unlock(&pg_owner->page_alloc_lock);
if ( drop_ref )
- put_page_and_type(page);
+ {
+ pin_drop:
+ if ( type == PGT_l1_page_table )
+ put_page_and_type(page);
+ else
+ curr->arch.old_guest_table = page;
+ }
}
break;
@@ -3669,11 +3705,28 @@ long do_mmu_update(
void *va;
unsigned long gpfn, gmfn, mfn;
struct page_info *page;
- int rc = 0, i = 0;
- unsigned int cmd, done = 0, pt_dom;
- struct vcpu *v = current;
+ unsigned int cmd, i = 0, done = 0, pt_dom;
+ struct vcpu *curr = current, *v = curr;
struct domain *d = v->domain, *pt_owner = d, *pg_owner;
struct domain_mmap_cache mapcache;
+ int rc = put_old_guest_table(curr);
+
+ if ( unlikely(rc) )
+ {
+ if ( likely(rc == -EAGAIN) )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
+ foreigndom);
+ return rc;
+ }
+
+ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+ likely(guest_handle_is_null(ureqs)) )
+ {
+ /* See the curr->arch.old_guest_table related
+ * hypercall_create_continuation() below. */
+ return (int)foreigndom;
+ }
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
@@ -3722,7 +3775,7 @@ long do_mmu_update(
for ( i = 0; i < count; i++ )
{
- if ( hypercall_preempt_check() )
+ if ( curr->arch.old_guest_table || hypercall_preempt_check() )
{
rc = -EAGAIN;
break;
@@ -3903,9 +3956,27 @@ long do_mmu_update(
}
if ( rc == -EAGAIN )
+ {
+ ASSERT(i < count);
rc = hypercall_create_continuation(
__HYPERVISOR_mmu_update, "hihi",
ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE(void) null;
+
+ ASSERT(rc || i == count);
+ set_xen_guest_handle(null, NULL);
+ /*
+ * In order to have a way to communicate the final return value to
+ * our continuation, we pass this in place of "foreigndom", building
+ * on the fact that this argument isn't needed anymore.
+ */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi", null,
+ MMU_UPDATE_PREEMPTED, null, rc);
+ }
put_pg_owner(pg_owner);

@@ -0,0 +1,406 @@
x86: cleanup after making various page table manipulation operations preemptible
This drops the "preemptible" parameters from various functions where
now they can't (or shouldn't, validated by assertions) be run in non-
preemptible mode anymore, to prove that manipulations of at least L3
and L4 page tables and page table entries are now always preemptible,
i.e. the earlier patches actually fulfill their purpose of fixing the
resulting security issue.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1986,7 +1986,7 @@ static int relinquish_memory(
}
if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
- ret = put_page_and_type_preemptible(page, 1);
+ ret = put_page_and_type_preemptible(page);
switch ( ret )
{
case 0:
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1044,7 +1044,7 @@ get_page_from_l2e(
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
{
int rc;
@@ -1058,7 +1058,7 @@ get_page_from_l3e(
}
rc = get_page_and_type_from_pagenr(
- l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
+ l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, 1);
if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
rc = 0;
@@ -1069,7 +1069,7 @@ get_page_from_l3e(
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
{
int rc;
@@ -1083,7 +1083,7 @@ get_page_from_l4e(
}
rc = get_page_and_type_from_pagenr(
- l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
+ l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, 1);
if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
rc = 0;
@@ -1237,8 +1237,10 @@ static int put_page_from_l2e(l2_pgentry_
static int __put_page_type(struct page_info *, int preemptible);
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- int partial, int preemptible)
+ int partial, bool_t defer)
{
+ struct page_info *pg;
+
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
return 1;
@@ -1257,41 +1259,45 @@ static int put_page_from_l3e(l3_pgentry_
}
#endif
+ pg = l3e_get_page(l3e);
+
if ( unlikely(partial > 0) )
{
- ASSERT(preemptible >= 0);
- return __put_page_type(l3e_get_page(l3e), preemptible);
+ ASSERT(!defer);
+ return __put_page_type(pg, 1);
}
- if ( preemptible < 0 )
+ if ( defer )
{
- current->arch.old_guest_table = l3e_get_page(l3e);
+ current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+ return put_page_and_type_preemptible(pg);
}
#if CONFIG_PAGING_LEVELS >= 4
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- int partial, int preemptible)
+ int partial, bool_t defer)
{
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
{
+ struct page_info *pg = l4e_get_page(l4e);
+
if ( unlikely(partial > 0) )
{
- ASSERT(preemptible >= 0);
- return __put_page_type(l4e_get_page(l4e), preemptible);
+ ASSERT(!defer);
+ return __put_page_type(pg, 1);
}
- if ( preemptible < 0 )
+ if ( defer )
{
- current->arch.old_guest_table = l4e_get_page(l4e);
+ current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+ return put_page_and_type_preemptible(pg);
}
return 1;
}
@@ -1509,7 +1515,7 @@ static int alloc_l2_table(struct page_in
return rc > 0 ? 0 : rc;
}
-static int alloc_l3_table(struct page_info *page, int preemptible)
+static int alloc_l3_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
@@ -1556,11 +1562,10 @@ static int alloc_l3_table(struct page_in
rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
PGT_l2_page_table |
PGT_pae_xen_l2,
- d, partial, preemptible);
+ d, partial, 1);
}
else if ( !is_guest_l3_slot(i) ||
- (rc = get_page_from_l3e(pl3e[i], pfn, d,
- partial, preemptible)) > 0 )
+ (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
continue;
if ( rc == -EAGAIN )
@@ -1604,7 +1609,7 @@ static int alloc_l3_table(struct page_in
}
#if CONFIG_PAGING_LEVELS >= 4
-static int alloc_l4_table(struct page_info *page, int preemptible)
+static int alloc_l4_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
@@ -1616,8 +1621,7 @@ static int alloc_l4_table(struct page_in
i++, partial = 0 )
{
if ( !is_guest_l4_slot(d, i) ||
- (rc = get_page_from_l4e(pl4e[i], pfn, d,
- partial, preemptible)) > 0 )
+ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
continue;
if ( rc == -EAGAIN )
@@ -1662,7 +1666,7 @@ static int alloc_l4_table(struct page_in
return rc > 0 ? 0 : rc;
}
#else
-#define alloc_l4_table(page, preemptible) (-EINVAL)
+#define alloc_l4_table(page) (-EINVAL)
#endif
@@ -1714,7 +1718,7 @@ static int free_l2_table(struct page_inf
return err;
}
-static int free_l3_table(struct page_info *page, int preemptible)
+static int free_l3_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
@@ -1727,7 +1731,7 @@ static int free_l3_table(struct page_inf
do {
if ( is_guest_l3_slot(i) )
{
- rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+ rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
if ( rc < 0 )
break;
partial = 0;
@@ -1754,7 +1758,7 @@ static int free_l3_table(struct page_inf
}
#if CONFIG_PAGING_LEVELS >= 4
-static int free_l4_table(struct page_info *page, int preemptible)
+static int free_l4_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
@@ -1764,7 +1768,7 @@ static int free_l4_table(struct page_inf
do {
if ( is_guest_l4_slot(d, i) )
- rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+ rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
if ( rc < 0 )
break;
partial = 0;
@@ -1784,7 +1788,7 @@ static int free_l4_table(struct page_inf
return rc > 0 ? 0 : rc;
}
#else
-#define free_l4_table(page, preemptible) (-EINVAL)
+#define free_l4_table(page) (-EINVAL)
#endif
int page_lock(struct page_info *page)
@@ -2023,7 +2027,6 @@ static int mod_l3_entry(l3_pgentry_t *pl
l3_pgentry_t nl3e,
unsigned long pfn,
int preserve_ad,
- int preemptible,
struct vcpu *vcpu)
{
l3_pgentry_t ol3e;
@@ -2063,7 +2066,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
return rc ? 0 : -EFAULT;
}
- rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
+ rc = get_page_from_l3e(nl3e, pfn, d, 0);
if ( unlikely(rc < 0) )
return rc;
rc = 0;
@@ -2090,7 +2093,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
}
- put_page_from_l3e(ol3e, pfn, 0, -preemptible);
+ put_page_from_l3e(ol3e, pfn, 0, 1);
return rc;
}
@@ -2101,7 +2104,6 @@ static int mod_l4_entry(l4_pgentry_t *pl
l4_pgentry_t nl4e,
unsigned long pfn,
int preserve_ad,
- int preemptible,
struct vcpu *vcpu)
{
struct domain *d = vcpu->domain;
@@ -2134,7 +2136,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
return rc ? 0 : -EFAULT;
}
- rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
+ rc = get_page_from_l4e(nl4e, pfn, d, 0);
if ( unlikely(rc < 0) )
return rc;
rc = 0;
@@ -2153,7 +2155,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
return -EFAULT;
}
- put_page_from_l4e(ol4e, pfn, 0, -preemptible);
+ put_page_from_l4e(ol4e, pfn, 0, 1);
return rc;
}
@@ -2275,10 +2277,12 @@ static int alloc_page_type(struct page_i
rc = alloc_l2_table(page, type, preemptible);
break;
case PGT_l3_page_table:
- rc = alloc_l3_table(page, preemptible);
+ ASSERT(preemptible);
+ rc = alloc_l3_table(page);
break;
case PGT_l4_page_table:
- rc = alloc_l4_table(page, preemptible);
+ ASSERT(preemptible);
+ rc = alloc_l4_table(page);
break;
case PGT_seg_desc_page:
rc = alloc_segdesc_page(page);
@@ -2372,10 +2376,12 @@ int free_page_type(struct page_info *pag
if ( !(type & PGT_partial) )
page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
#endif
- rc = free_l3_table(page, preemptible);
+ ASSERT(preemptible);
+ rc = free_l3_table(page);
break;
case PGT_l4_page_table:
- rc = free_l4_table(page, preemptible);
+ ASSERT(preemptible);
+ rc = free_l4_table(page);
break;
default:
MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
@@ -2866,7 +2872,7 @@ static int put_old_guest_table(struct vc
if ( !v->arch.old_guest_table )
return 0;
- switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table, 1) )
+ switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
{
case -EINTR:
case -EAGAIN:
@@ -2898,7 +2904,7 @@ int vcpu_destroy_pagetables(struct vcpu
if ( paging_mode_refcounts(v->domain) )
put_page(page);
else
- rc = put_page_and_type_preemptible(page, 1);
+ rc = put_page_and_type_preemptible(page);
}
#ifdef __x86_64__
@@ -2924,7 +2930,7 @@ int vcpu_destroy_pagetables(struct vcpu
if ( paging_mode_refcounts(v->domain) )
put_page(page);
else
- rc = put_page_and_type_preemptible(page, 1);
+ rc = put_page_and_type_preemptible(page);
}
if ( !rc )
v->arch.guest_table_user = pagetable_null();
@@ -2953,7 +2959,7 @@ int new_guest_cr3(unsigned long mfn)
l4e_from_pfn(
mfn,
(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
- pagetable_get_pfn(curr->arch.guest_table), 0, 1, curr);
+ pagetable_get_pfn(curr->arch.guest_table), 0, curr);
switch ( rc )
{
case 0:
@@ -3016,7 +3022,7 @@ int new_guest_cr3(unsigned long mfn)
if ( paging_mode_refcounts(d) )
put_page(page);
else
- switch ( rc = put_page_and_type_preemptible(page, 1) )
+ switch ( rc = put_page_and_type_preemptible(page) )
{
case -EINTR:
rc = -EAGAIN;
@@ -3327,7 +3333,7 @@ long do_mmuext_op(
break;
}
- switch ( rc = put_page_and_type_preemptible(page, 1) )
+ switch ( rc = put_page_and_type_preemptible(page) )
{
case -EINTR:
case -EAGAIN:
@@ -3405,7 +3411,7 @@ long do_mmuext_op(
if ( paging_mode_refcounts(d) )
put_page(page);
else
- switch ( rc = put_page_and_type_preemptible(page, 1) )
+ switch ( rc = put_page_and_type_preemptible(page) )
{
case -EINTR:
rc = -EAGAIN;
@@ -3882,12 +3888,12 @@ long do_mmu_update(
break;
case PGT_l3_page_table:
rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
break;
#if CONFIG_PAGING_LEVELS >= 4
case PGT_l4_page_table:
rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
break;
#endif
case PGT_writable_page:
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -384,15 +384,10 @@ static inline void put_page_and_type(str
put_page(page);
}
-static inline int put_page_and_type_preemptible(struct page_info *page,
- int preemptible)
+static inline int put_page_and_type_preemptible(struct page_info *page)
{
- int rc = 0;
+ int rc = put_page_type_preemptible(page);
- if ( preemptible )
- rc = put_page_type_preemptible(page);
- else
- put_page_type(page);
if ( likely(rc == 0) )
put_page(page);
return rc;
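
For reference, a minimal, compilable sketch of what this hunk leaves behind (this is not the Xen source: struct page_info's layout and both stubs below are invented placeholders). The point of the change is that put_page_and_type_preemptible() loses its 'preemptible' flag, the non-preemptible branch disappears, and every remaining caller must be ready for -EINTR/-EAGAIN; the ASSERT(preemptible) additions in alloc_page_type()/free_page_type() enforce the same invariant for L3/L4 tables.

#include <stdio.h>

/* Invented placeholder; the real struct page_info lives in xen/include. */
struct page_info { int type_refs, refs; };

/* Stub standing in for Xen's preemptible type-ref drop.  The real one
 * may return -EINTR or -EAGAIN to ask the caller to restart the
 * hypercall; this stub always completes. */
static int put_page_type_preemptible(struct page_info *pg)
{
    pg->type_refs--;
    return 0;
}

static void put_page(struct page_info *pg)
{
    pg->refs--;
}

/* The helper as it reads after the patch: no flag, one behaviour. */
static int put_page_and_type_preemptible(struct page_info *page)
{
    int rc = put_page_type_preemptible(page);

    if ( rc == 0 )
        put_page(page);  /* drop the general ref only once the type ref is gone */
    return rc;           /* -EINTR/-EAGAIN propagate up to the hypercall layer */
}

int main(void)
{
    struct page_info pg = { 1, 1 };

    if ( put_page_and_type_preemptible(&pg) )
        return 1;        /* a real caller would arrange to retry instead */
    printf("refs=%d type_refs=%d\n", pg.refs, pg.type_refs);
    return 0;
}

Making the preemptible path the only path is what allows these long-running page-table teardown operations to be restarted rather than forced to run to completion.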

CVE-2013-1922-xsa48.patch (new file)
@@ -0,0 +1,112 @@
References: bnc#81???? CVE-2013-1922 XSA-48
Add -f FMT / --format FMT arg to qemu-nbd
From: "Daniel P. Berrange" <berrange@redhat.com>
Currently the qemu-nbd program will auto-detect the format of
any disk it is given. This behaviour is known to be insecure.
For example, if qemu-nbd initially exposes a 'raw' file to an
unprivileged app, and that app runs
'qemu-img create -f qcow2 -o backing_file=/etc/shadow /dev/nbd0'
then the next time the app is started, the qemu-nbd will now
detect it as a 'qcow2' file and expose /etc/shadow to the
unprivileged app.
The only way to avoid this is to explicitly tell qemu-nbd what
disk format to use on the command line, completely disabling
auto-detection. This patch adds a '-f' / '--format' arg for
this purpose, mirroring what is already available via qemu-img
and qemu commands.
qemu-nbd --format raw -p 9000 evil.img
will now always use raw, regardless of what format 'evil.img'
appears to contain.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
[Use errx, not err. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
[ This is a security issue, CVE-2013-1922 / XSA-48. ]
--- a/tools/qemu-xen-dir-remote/qemu-nbd.c
+++ b/tools/qemu-xen-dir-remote/qemu-nbd.c
@@ -247,6 +247,7 @@ out:
int main(int argc, char **argv)
{
BlockDriverState *bs;
+ BlockDriver *drv;
off_t dev_offset = 0;
off_t offset = 0;
uint32_t nbdflags = 0;
@@ -256,7 +257,7 @@ int main(int argc, char **argv)
struct sockaddr_in addr;
socklen_t addr_len = sizeof(addr);
off_t fd_size;
- const char *sopt = "hVb:o:p:rsnP:c:dvk:e:t";
+ const char *sopt = "hVb:o:p:rsnP:c:dvk:e:f:t";
struct option lopt[] = {
{ "help", 0, NULL, 'h' },
{ "version", 0, NULL, 'V' },
@@ -271,6 +272,7 @@ int main(int argc, char **argv)
{ "snapshot", 0, NULL, 's' },
{ "nocache", 0, NULL, 'n' },
{ "shared", 1, NULL, 'e' },
+ { "format", 1, NULL, 'f' },
{ "persistent", 0, NULL, 't' },
{ "verbose", 0, NULL, 'v' },
{ NULL, 0, NULL, 0 }
@@ -292,6 +294,7 @@ int main(int argc, char **argv)
int max_fd;
int persistent = 0;
pthread_t client_thread;
+ const char *fmt = NULL;
/* The client thread uses SIGTERM to interrupt the server. A signal
* handler ensures that "qemu-nbd -v -c" exits with a nice status code.
@@ -368,6 +371,9 @@ int main(int argc, char **argv)
errx(EXIT_FAILURE, "Shared device number must be greater than 0\n");
}
break;
+ case 'f':
+ fmt = optarg;
+ break;
case 't':
persistent = 1;
break;
@@ -478,9 +484,19 @@ int main(int argc, char **argv)
bdrv_init();
atexit(bdrv_close_all);
+ if (fmt) {
+ drv = bdrv_find_format(fmt);
+ if (!drv) {
+ errx(EXIT_FAILURE, "Unknown file format '%s'", fmt);
+ }
+ } else {
+ drv = NULL;
+ }
+
bs = bdrv_new("hda");
srcpath = argv[optind];
- if ((ret = bdrv_open(bs, srcpath, flags, NULL)) < 0) {
+ ret = bdrv_open(bs, srcpath, flags, drv);
+ if (ret < 0) {
errno = -ret;
err(EXIT_FAILURE, "Failed to bdrv_open '%s'", argv[optind]);
}
--- a/tools/qemu-xen-dir-remote/qemu-nbd.texi
+++ b/tools/qemu-xen-dir-remote/qemu-nbd.texi
@@ -36,6 +36,8 @@ Export Qemu disk image using NBD protocol
disconnect the specified device
@item -e, --shared=@var{num}
device can be shared by @var{num} clients (default @samp{1})
+@item -f, --format=@var{fmt}
+ force block driver for format @var{fmt} instead of auto-detecting
@item -t, --persistent
don't exit on the last connection
@item -v, --verbose

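To see why auto-detection is exploitable, here is a short, self-contained illustration (not QEMU code: probe_format() is a toy prober, though the 4-byte magic "QFI\xfb" is the real qcow2 signature). A guest only has to write that signature, plus a backing_file field naming a host path, at offset 0 of its nominally raw disk; the next probing open then treats the image as qcow2 and follows the backing file.

#include <stdio.h>
#include <string.h>

/* Toy prober: guest-controlled bytes decide the answer -- that is the bug. */
static const char *probe_format(const unsigned char *hdr, size_t len)
{
    if (len >= 4 && memcmp(hdr, "QFI\xfb", 4) == 0)
        return "qcow2";
    return "raw";
}

int main(void)
{
    /* Bytes an unprivileged guest can write at offset 0 of its disk. */
    const unsigned char evil[4] = { 'Q', 'F', 'I', 0xfb };

    printf("probed as: %s\n", probe_format(evil, sizeof evil));
    /* With the new option the format is pinned up front instead:
     *   qemu-nbd --format raw -p 9000 evil.img */
    return 0;
}

Pinning the format with -f/--format makes the probe result irrelevant, which is why the patch resolves the user-supplied name via bdrv_find_format() and passes the resulting BlockDriver to bdrv_open() instead of NULL.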