Accepting request 79890 from Virtualization

Checkin for Milestone 5

OBS-URL: https://build.opensuse.org/request/show/79890
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/xen?expand=0&rev=145
This commit is contained in:
Sascha Peilicke 2011-08-29 07:40:21 +00:00 committed by Git OBS Bridge
commit 4e52c6689a
36 changed files with 3070 additions and 55 deletions

View File

@ -0,0 +1,65 @@
# HG changeset patch
# User Stefano Stabellini <stefano.stabellini@eu.citrix.com>
# Date 1310654989 -3600
# Node ID 5239811f92e1ffb185a50172fdcf47372e71ba7e
# Parent 98701b1276c034b2bbbc8c7a975cf4c361caaa63
libxl: Fix segfault in get_all_assigned_devices
pcidevs is an array of ndev elements (ndev is the number of pci devices
assigned to a specific domain), but we access pcidevs + *num
where *num is the global number of pci devices assigned so far to all
domains in the system.
Fix the issue by removing pcidevs and just realloc'ing *list every time we
want to add a new pci device to the array.
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
Index: xen-4.1.1-testing/tools/libxl/libxl_pci.c
===================================================================
--- xen-4.1.1-testing.orig/tools/libxl/libxl_pci.c
+++ xen-4.1.1-testing/tools/libxl/libxl_pci.c
@@ -434,7 +434,6 @@ retry_transaction2:
static int get_all_assigned_devices(libxl__gc *gc, libxl_device_pci **list, int *num)
{
- libxl_device_pci *pcidevs = NULL;
char **domlist;
unsigned int nd = 0, i;
@@ -451,8 +450,7 @@ static int get_all_assigned_devices(libx
int ndev = atoi(num_devs), j;
char *devpath, *bdf;
- pcidevs = libxl__calloc(gc, sizeof(*pcidevs), ndev);
- for(j = (pcidevs) ? 0 : ndev; j < ndev; j++) {
+ for(j = 0; j < ndev; j++) {
devpath = libxl__sprintf(gc, "/local/domain/0/backend/pci/%s/0/dev-%u",
domlist[i], j);
bdf = libxl__xs_read(gc, XBT_NULL, devpath);
@@ -461,19 +459,16 @@ static int get_all_assigned_devices(libx
if ( sscanf(bdf, PCI_BDF, &dom, &bus, &dev, &func) != 4 )
continue;
- pcidev_init(pcidevs + *num, dom, bus, dev, func, 0);
+ *list = realloc(*list, sizeof(libxl_device_pci) * ((*num) + 1));
+ if (*list == NULL)
+ return ERROR_NOMEM;
+ pcidev_init(*list + *num, dom, bus, dev, func, 0);
(*num)++;
}
}
}
}
-
- if ( 0 == *num ) {
- free(pcidevs);
- pcidevs = NULL;
- }else{
- *list = pcidevs;
- }
+ libxl__ptr_add(gc, *list);
return 0;
}

174
23725-pci-add-device.patch Normal file
View File

@ -0,0 +1,174 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1311081248 -3600
# Node ID 4dc6a9ba90d60fdf0cc0898fc9a8fe84ae9030fc
# Parent b3434f24b0827c5ef34e4b4a72893288e2ffbe40
PCI: consolidate interface for adding devices
The functionality of pci_add_device_ext() can be easily folded into
pci_add_device(), and eliminates the need to change two functions for
future adjustments.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/ia64/xen/hypercall.c
+++ b/xen/arch/ia64/xen/hypercall.c
@@ -662,8 +662,8 @@ long do_physdev_op(int cmd, XEN_GUEST_HA
if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
break;
- ret = pci_add_device(manage_pci.bus, manage_pci.devfn);
- break;
+ ret = pci_add_device(manage_pci.bus, manage_pci.devfn, NULL);
+ break;
}
case PHYSDEVOP_manage_pci_remove: {
@@ -695,10 +695,10 @@ long do_physdev_op(int cmd, XEN_GUEST_HA
pdev_info.is_virtfn = manage_pci_ext.is_virtfn;
pdev_info.physfn.bus = manage_pci_ext.physfn.bus;
pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn;
- ret = pci_add_device_ext(manage_pci_ext.bus,
- manage_pci_ext.devfn,
- &pdev_info);
- break;
+ ret = pci_add_device(manage_pci_ext.bus,
+ manage_pci_ext.devfn,
+ &pdev_info);
+ break;
}
default:
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -472,7 +472,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
break;
- ret = pci_add_device(manage_pci.bus, manage_pci.devfn);
+ ret = pci_add_device(manage_pci.bus, manage_pci.devfn, NULL);
break;
}
@@ -509,9 +509,9 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
pdev_info.is_virtfn = manage_pci_ext.is_virtfn;
pdev_info.physfn.bus = manage_pci_ext.physfn.bus;
pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn;
- ret = pci_add_device_ext(manage_pci_ext.bus,
- manage_pci_ext.devfn,
- &pdev_info);
+ ret = pci_add_device(manage_pci_ext.bus,
+ manage_pci_ext.devfn,
+ &pdev_info);
break;
}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -142,16 +142,29 @@ void pci_enable_acs(struct pci_dev *pdev
pci_conf_write16(bus, dev, func, pos + PCI_ACS_CTRL, ctrl);
}
-int pci_add_device(u8 bus, u8 devfn)
+int pci_add_device(u8 bus, u8 devfn, const struct pci_dev_info *info)
{
struct pci_dev *pdev;
+ const char *pdev_type;
int ret = -ENOMEM;
+ if (!info)
+ pdev_type = "device";
+ else if (info->is_extfn)
+ pdev_type = "extended function";
+ else if (info->is_virtfn)
+ pdev_type = "virtual function";
+ else
+ return -EINVAL;
+
spin_lock(&pcidevs_lock);
pdev = alloc_pdev(bus, devfn);
if ( !pdev )
goto out;
+ if ( info )
+ pdev->info = *info;
+
ret = 0;
if ( !pdev->domain )
{
@@ -169,8 +182,8 @@ int pci_add_device(u8 bus, u8 devfn)
out:
spin_unlock(&pcidevs_lock);
- printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus,
- PCI_SLOT(devfn), PCI_FUNC(devfn));
+ printk(XENLOG_DEBUG "PCI add %s %02x:%02x.%x\n", pdev_type,
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
return ret;
}
@@ -197,51 +210,6 @@ int pci_remove_device(u8 bus, u8 devfn)
return ret;
}
-int pci_add_device_ext(u8 bus, u8 devfn, struct pci_dev_info *info)
-{
- int ret;
- char *pdev_type;
- struct pci_dev *pdev;
-
- if (info->is_extfn)
- pdev_type = "Extended Function";
- else if (info->is_virtfn)
- pdev_type = "Virtual Function";
- else
- return -EINVAL;
-
-
- ret = -ENOMEM;
- spin_lock(&pcidevs_lock);
- pdev = alloc_pdev(bus, devfn);
- if ( !pdev )
- goto out;
-
- pdev->info = *info;
-
- ret = 0;
- if ( !pdev->domain )
- {
- pdev->domain = dom0;
- ret = iommu_add_device(pdev);
- if ( ret )
- {
- pdev->domain = NULL;
- goto out;
- }
-
- list_add(&pdev->domain_list, &dom0->arch.pdev_list);
- pci_enable_acs(pdev);
- }
-
-out:
- spin_unlock(&pcidevs_lock);
- printk(XENLOG_DEBUG "PCI add %s %02x:%02x.%x\n", pdev_type,
- bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-
- return ret;
-}
-
static void pci_clean_dpci_irqs(struct domain *d)
{
struct hvm_irq_dpci *hvm_irq_dpci = NULL;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -86,9 +86,8 @@ struct pci_dev *pci_lock_pdev(int bus, i
struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn);
void pci_release_devices(struct domain *d);
-int pci_add_device(u8 bus, u8 devfn);
+int pci_add_device(u8 bus, u8 devfn, const struct pci_dev_info *);
int pci_remove_device(u8 bus, u8 devfn);
-int pci_add_device_ext(u8 bus, u8 devfn, struct pci_dev_info *info);
struct pci_dev *pci_get_pdev(int bus, int devfn);
struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn);

60
23732-sedf.patch Normal file
View File

@ -0,0 +1,60 @@
# HG changeset patch
# User George Dunlap <george.dunlap@eu.citrix.com>
# Date 1311255331 -3600
# Node ID 3795d79c740b2aa50aacb7bf7e3503862a7b436c
# Parent 48f72b389b04cfa8d44924577a69ed59e48fbe77
xen: Fix sedf scheduler
Update the sedf scheduler to be compatible with the most recent
generic scheduler interface changes.
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
--- a/xen/common/sched_sedf.c
+++ b/xen/common/sched_sedf.c
@@ -331,6 +331,19 @@ static inline void __add_to_runqueue_sor
}
+static void sedf_insert_vcpu(const struct scheduler *ops, struct vcpu *v)
+{
+ if ( !is_idle_vcpu(v) )
+ {
+ extraq_check(v);
+ }
+ else
+ {
+ EDOM_INFO(v)->deadl_abs = 0;
+ EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
+ }
+}
+
static void *sedf_alloc_vdata(const struct scheduler *ops, struct vcpu *v, void *dd)
{
struct sedf_vcpu_info *inf;
@@ -365,16 +378,6 @@ static void *sedf_alloc_vdata(const stru
INIT_LIST_HEAD(&(inf->list));
INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q]));
INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q]));
-
- if ( !is_idle_vcpu(v) )
- {
- extraq_check(v);
- }
- else
- {
- inf->deadl_abs = 0;
- inf->status &= ~SEDF_ASLEEP;
- }
return inf;
}
@@ -1498,6 +1501,8 @@ const struct scheduler sched_sedf_def =
.init_domain = sedf_init_domain,
.destroy_domain = sedf_destroy_domain,
+ .insert_vcpu = sedf_insert_vcpu,
+
.alloc_vdata = sedf_alloc_vdata,
.free_vdata = sedf_free_vdata,
.alloc_pdata = sedf_alloc_pdata,

234
23735-guest-dom0-cap.patch Normal file
View File

@ -0,0 +1,234 @@
References: bnc#702407
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1311407355 -3600
# Node ID 537918f518eec3d8e2e2dad403fce40303321523
# Parent 42edf1481c5704c8ce1eb171a713b5411df0551a
add privileged (dom0) kernel feature indication
With our switching away from supporting 32-bit Dom0 operation, users
complained that attempts (perhaps due to lack of knowledge of that
change) to boot the no longer privileged kernel in Dom0 resulted in
apparently silent failure. To make the mismatch explicit and visible,
add dom0 feature flag that the kernel can set to indicate operation as
dom0 is supported.
Due to the way elf_xen_parse_features() worked up to now (getting
fixed here), adding features indications to the old, string based ELF
note would make the respective kernel unusable on older hypervisors.
For that reason, a new ELF Note is being introduced that allows
specifying supported features as a bit array instead (with features
unknown to the hypervisor simply ignored, as now also done by
elf_xen_parse_features(), whereas here unknown kernel-required
features still keep the kernel [and hence VM] from booting).
Introduce and use elf_note_numeric_array() to be forward
compatible (or else an old hypervisor wouldn't be able to parse kernel
specified features occupying more than 64 bits - thanks, Ian!).
Signed-off-by: Jan Beulich <jbeulich@novell.com>
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1311598088 -3600
# Node ID 50ddc200a60cad3929a79a992f09145fd39af49d
# Parent d8725d9fb8657874011d2f2772f5e970b24dfe9b
fix regression from c/s 23735:537918f518ee
This was checking presence of the wrong (old) ELF note. I don't really
understand how this failed consistently only for one of the xen-boot
tests...
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/tools/libxc/xc_dom_elfloader.c
+++ b/tools/libxc/xc_dom_elfloader.c
@@ -286,6 +286,13 @@ static int xc_dom_parse_elf_kernel(struc
if ( (rc = elf_xen_parse(elf, &dom->parms)) != 0 )
return rc;
+ if ( elf_xen_feature_get(XENFEAT_dom0, dom->parms.f_required) )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Kernel does not"
+ " support unprivileged (DomU) operation", __FUNCTION__);
+ return -EINVAL;
+ }
+
/* find kernel segment */
dom->kernel_seg.vstart = dom->parms.virt_kstart;
dom->kernel_seg.vend = dom->parms.virt_kend;
--- a/xen/arch/ia64/xen/domain.c
+++ b/xen/arch/ia64/xen/domain.c
@@ -2164,6 +2164,13 @@ int __init construct_dom0(struct domain
return -1;
}
+ if (parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
+ !test_bit(XENFEAT_dom0, parms.f_supported))
+ {
+ printk("Kernel does not support Dom0 operation\n");
+ return -1;
+ }
+
p_start = parms.virt_base;
pkern_start = parms.virt_kstart;
pkern_end = parms.virt_kend;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -415,6 +415,13 @@ int __init construct_dom0(
return -EINVAL;
}
+ if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
+ !test_bit(XENFEAT_dom0, parms.f_supported) )
+ {
+ printk("Kernel does not support Dom0 operation\n");
+ return -EINVAL;
+ }
+
#if defined(__x86_64__)
if ( compat32 )
{
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -287,6 +287,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
(1U << XENFEAT_auto_translated_physmap);
if ( supervisor_mode_kernel )
fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
+ if ( current->domain == dom0 )
+ fi.submap |= 1U << XENFEAT_dom0;
#ifdef CONFIG_X86
if ( !is_hvm_vcpu(current) )
fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
--- a/xen/common/libelf/libelf-dominfo.c
+++ b/xen/common/libelf/libelf-dominfo.c
@@ -26,7 +26,8 @@ static const char *const elf_xen_feature
[XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
[XENFEAT_auto_translated_physmap] = "auto_translated_physmap",
[XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel",
- [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb"
+ [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb",
+ [XENFEAT_dom0] = "dom0"
};
static const int elf_xen_features =
sizeof(elf_xen_feature_names) / sizeof(elf_xen_feature_names[0]);
@@ -82,7 +83,7 @@ int elf_xen_parse_features(const char *f
}
}
}
- if ( i == elf_xen_features )
+ if ( i == elf_xen_features && required && feature[0] == '!' )
return -1;
}
@@ -113,6 +114,7 @@ int elf_xen_parse_note(struct elf_binary
[XEN_ELFNOTE_LOADER] = { "LOADER", 1},
[XEN_ELFNOTE_PAE_MODE] = { "PAE_MODE", 1},
[XEN_ELFNOTE_FEATURES] = { "FEATURES", 1},
+ [XEN_ELFNOTE_SUPPORTED_FEATURES] = { "SUPPORTED_FEATURES", 0},
[XEN_ELFNOTE_BSD_SYMTAB] = { "BSD_SYMTAB", 1},
[XEN_ELFNOTE_SUSPEND_CANCEL] = { "SUSPEND_CANCEL", 0 },
[XEN_ELFNOTE_MOD_START_PFN] = { "MOD_START_PFN", 0 },
@@ -121,6 +123,7 @@ int elf_xen_parse_note(struct elf_binary
const char *str = NULL;
uint64_t val = 0;
+ unsigned int i;
int type = elf_uval(elf, note, type);
if ( (type >= sizeof(note_desc) / sizeof(note_desc[0])) ||
@@ -199,6 +202,12 @@ int elf_xen_parse_note(struct elf_binary
return -1;
break;
+ case XEN_ELFNOTE_SUPPORTED_FEATURES:
+ for ( i = 0; i < XENFEAT_NR_SUBMAPS; ++i )
+ parms->f_supported[i] |= elf_note_numeric_array(
+ elf, note, sizeof(*parms->f_supported), i);
+ break;
+
}
return 0;
}
--- a/xen/common/libelf/libelf-tools.c
+++ b/xen/common/libelf/libelf-tools.c
@@ -227,6 +227,27 @@ uint64_t elf_note_numeric(struct elf_bin
return 0;
}
}
+
+uint64_t elf_note_numeric_array(struct elf_binary *elf, const elf_note *note,
+ unsigned int unitsz, unsigned int idx)
+{
+ const void *desc = elf_note_desc(elf, note);
+ int descsz = elf_uval(elf, note, descsz);
+
+ if ( descsz % unitsz || idx >= descsz / unitsz )
+ return 0;
+ switch (unitsz)
+ {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ return elf_access_unsigned(elf, desc, idx * unitsz, unitsz);
+ default:
+ return 0;
+ }
+}
+
const elf_note *elf_note_next(struct elf_binary *elf, const elf_note * note)
{
int namesz = (elf_uval(elf, note, namesz) + 3) & ~3;
--- a/xen/include/public/elfnote.h
+++ b/xen/include/public/elfnote.h
@@ -179,9 +179,22 @@
#define XEN_ELFNOTE_MOD_START_PFN 16
/*
+ * The features supported by this kernel (numeric).
+ *
+ * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a
+ * kernel to specify support for features that older hypervisors don't
+ * know about. The set of features 4.2 and newer hypervisors will
+ * consider supported by the kernel is the combination of the sets
+ * specified through this and the string note.
+ *
+ * LEGACY: FEATURES
+ */
+#define XEN_ELFNOTE_SUPPORTED_FEATURES 17
+
+/*
* The number of the highest elfnote defined.
*/
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_MOD_START_PFN
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
/*
* System information exported through crash notes.
--- a/xen/include/public/features.h
+++ b/xen/include/public/features.h
@@ -75,7 +75,10 @@
#define XENFEAT_hvm_safe_pvclock 9
/* x86: pirq can be used by HVM guests */
-#define XENFEAT_hvm_pirqs 10
+#define XENFEAT_hvm_pirqs 10
+
+/* operation as Dom0 is supported */
+#define XENFEAT_dom0 11
#define XENFEAT_NR_SUBMAPS 1
--- a/xen/include/xen/libelf.h
+++ b/xen/include/xen/libelf.h
@@ -179,6 +179,8 @@ const elf_sym *elf_sym_by_index(struct e
const char *elf_note_name(struct elf_binary *elf, const elf_note * note);
const void *elf_note_desc(struct elf_binary *elf, const elf_note * note);
uint64_t elf_note_numeric(struct elf_binary *elf, const elf_note * note);
+uint64_t elf_note_numeric_array(struct elf_binary *, const elf_note *,
+ unsigned int unitsz, unsigned int idx);
const elf_note *elf_note_next(struct elf_binary *elf, const elf_note * note);
int elf_is_elfbinary(const void *image);

View File

@ -0,0 +1,43 @@
# HG changeset patch
# User Tim Deegan <Tim.Deegan@citrix.com>
# Date 1311608493 -3600
# Node ID aa54b8175954bd6ffeb3bcf72e782e133896b388
# Parent 9dbbf1631193bb6df679f5eaaee192ef4ef91fd9
VT-d: always clean up dpci timers.
If a VM has all its PCI devices deassigned, need_iommu(d) becomes
false but it might still have DPCI EOI timers that were init_timer()d
but not yet kill_timer()d. That causes xen to crash later because the
linked list of inactive timers gets corrupted, e.g.:
(XEN) Xen call trace:
(XEN) [<ffff82c480126256>] set_timer+0x1c2/0x24f
(XEN) [<ffff82c48011fbf8>] schedule+0x129/0x5dd
(XEN) [<ffff82c480122c1e>] __do_softirq+0x7e/0x89
(XEN) [<ffff82c480122c9d>] do_softirq+0x26/0x28
(XEN) [<ffff82c480153c85>] idle_loop+0x5a/0x5c
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) Assertion 'entry->next->prev == entry' failed at
/local/scratch/tdeegan/xen-unstable.hg/xen/include:172
(XEN) ****************************************
The following patch makes sure that the domain destruction path always
clears up the DPCI state even if !needs_iommu(d).
Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -220,9 +220,6 @@ static void pci_clean_dpci_irqs(struct d
if ( !iommu_enabled )
return;
- if ( !need_iommu(d) )
- return;
-
spin_lock(&d->event_lock);
hvm_irq_dpci = domain_get_irq_dpci(d);
if ( hvm_irq_dpci != NULL )

View File

@ -0,0 +1,64 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1311608539 -3600
# Node ID b07b6fa766562c990b1d1e59af032feda15c2edb
# Parent aa54b8175954bd6ffeb3bcf72e782e133896b388
x86-64/MMCFG: correct base address computation for regions not starting at bus 0
As per the specification, the base address reported by ACPI is the one
that would be used if the region started at bus 0. Hence the
start_bus_number offset needs to be added not only to the virtual
address, but also the physical one when establishing the mapping, and
it then needs to be subtracted when obtaining the virtual address for
doing accesses.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -25,7 +25,7 @@ struct mmcfg_virt {
static struct mmcfg_virt *pci_mmcfg_virt;
static int __initdata mmcfg_pci_segment_shift;
-static char __iomem *get_virt(unsigned int seg, unsigned bus)
+static char __iomem *get_virt(unsigned int seg, unsigned int *bus)
{
struct acpi_mcfg_allocation *cfg;
int cfg_num;
@@ -33,9 +33,11 @@ static char __iomem *get_virt(unsigned i
for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
cfg = pci_mmcfg_virt[cfg_num].cfg;
if (cfg->pci_segment == seg &&
- (cfg->start_bus_number <= bus) &&
- (cfg->end_bus_number >= bus))
+ (cfg->start_bus_number <= *bus) &&
+ (cfg->end_bus_number >= *bus)) {
+ *bus -= cfg->start_bus_number;
return pci_mmcfg_virt[cfg_num].virt;
+ }
}
/* Fall back to type 0 */
@@ -46,7 +48,7 @@ static char __iomem *pci_dev_base(unsign
{
char __iomem *addr;
- addr = get_virt(seg, bus);
+ addr = get_virt(seg, &bus);
if (!addr)
return NULL;
return addr + ((bus << 20) | (devfn << 12));
@@ -121,8 +123,11 @@ static void __iomem * __init mcfg_iorema
if (virt + size < virt || virt + size > PCI_MCFG_VIRT_END)
return NULL;
- map_pages_to_xen(virt, cfg->address >> PAGE_SHIFT,
- size >> PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE);
+ if (map_pages_to_xen(virt,
+ (cfg->address >> PAGE_SHIFT) +
+ (cfg->start_bus_number << (20 - PAGE_SHIFT)),
+ size >> PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE))
+ return NULL;
return (void __iomem *) virt;
}

View File

@ -0,0 +1,389 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1311608606 -3600
# Node ID e8d1c8f074babcb0e4511393106e80a918a38204
# Parent e1717d180897e6e7a04d83a41d86b35ac16912b9
x86-64/MMCFG: pass down firmware (ACPI) reservation status of used memory space
Reserving the MMCFG address range(s) in E820 is specified to only be
optional for the firmware to do. The requirement is to have them
reserved in ACPI resources. Those, however, aren't directly visible to
Xen as they require the ACPI interpreter to be active. Thus, if a
range isn't reserved in E820, we should not completely disable use of
MMCFG on the respective bus range, but rather keep it disabled until
Dom0 can pass down information on the ACPI reservation status (through
a new physdevop hypercall).
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -16,6 +16,10 @@
#include <xsm/xsm.h>
#include <asm/p2m.h>
+#ifdef CONFIG_X86_64
+#include "x86_64/mmconfig.h"
+#endif
+
#ifndef COMPAT
typedef long ret_t;
#endif
@@ -515,6 +519,24 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
break;
}
+#ifdef __x86_64__
+ case PHYSDEVOP_pci_mmcfg_reserved: {
+ struct physdev_pci_mmcfg_reserved info;
+
+ ret = -EPERM;
+ if ( !IS_PRIV(current->domain) )
+ break;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&info, arg, 1) )
+ break;
+
+ ret = pci_mmcfg_reserved(info.address, info.segment,
+ info.start_bus, info.end_bus, info.flags);
+ break;
+ }
+#endif
+
case PHYSDEVOP_restore_msi: {
struct physdev_restore_msi restore_msi;
struct pci_dev *pdev;
--- a/xen/arch/x86/x86_64/mmconfig.h
+++ b/xen/arch/x86/x86_64/mmconfig.h
@@ -84,6 +84,11 @@ extern int pci_mmcfg_config_num;
extern struct acpi_mcfg_allocation *pci_mmcfg_config;
/* function prototypes */
+struct acpi_table_header;
int acpi_parse_mcfg(struct acpi_table_header *header);
+int pci_mmcfg_reserved(uint64_t address, unsigned int segment,
+ unsigned int start_bus, unsigned int end_bus,
+ unsigned int flags);
int pci_mmcfg_arch_init(void);
-void pci_mmcfg_arch_free(void);
+int pci_mmcfg_arch_enable(unsigned int);
+void pci_mmcfg_arch_disable(unsigned int);
--- a/xen/arch/x86/x86_64/mmconfig-shared.c
+++ b/xen/arch/x86/x86_64/mmconfig-shared.c
@@ -22,10 +22,10 @@
#include <asm/e820.h>
#include <asm/msr.h>
#include <asm/msr-index.h>
+#include <public/physdev.h>
#include "mmconfig.h"
-static int __initdata known_bridge;
unsigned int pci_probe = PCI_PROBE_CONF1 | PCI_PROBE_MMCONF;
static void __init parse_mmcfg(char *s)
@@ -316,26 +316,21 @@ static int __init pci_mmcfg_check_hostbr
return name != NULL;
}
-typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
-
static int __init is_mmconf_reserved(
- check_reserved_t is_reserved,
u64 addr, u64 size, int i,
- typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
+ typeof(pci_mmcfg_config[0]) *cfg)
{
u64 old_size = size;
int valid = 0;
- while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
+ while (!e820_all_mapped(addr, addr + size - 1, E820_RESERVED)) {
size >>= 1;
if (size < (16UL<<20))
break;
}
if (size >= (16UL<<20) || size == old_size) {
- printk(KERN_NOTICE
- "PCI: MCFG area at %lx reserved in %s\n",
- addr, with_e820?"E820":"ACPI motherboard resources");
+ printk(KERN_NOTICE "PCI: MCFG area at %lx reserved in E820\n", addr);
valid = 1;
if (old_size != size) {
@@ -352,15 +347,16 @@ static int __init is_mmconf_reserved(
return valid;
}
-static void __init pci_mmcfg_reject_broken(void)
+static bool_t __init pci_mmcfg_reject_broken(void)
{
typeof(pci_mmcfg_config[0]) *cfg;
int i;
+ bool_t valid = 1;
if ((pci_mmcfg_config_num == 0) ||
(pci_mmcfg_config == NULL) ||
(pci_mmcfg_config[0].address == 0))
- return;
+ return 0;
cfg = &pci_mmcfg_config[0];
@@ -374,27 +370,25 @@ static void __init pci_mmcfg_reject_brok
size = cfg->end_bus_number + 1 - cfg->start_bus_number;
size <<= 20;
printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
- "segment %hu buses %u - %u\n",
+ "segment %04x buses %02x - %02x\n",
i, (unsigned long)cfg->address, cfg->pci_segment,
(unsigned int)cfg->start_bus_number,
(unsigned int)cfg->end_bus_number);
- if (!is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1))
- goto reject;
+ if (!is_mmconf_reserved(addr, size, i, cfg) ||
+ pci_mmcfg_arch_enable(i)) {
+ pci_mmcfg_arch_disable(i);
+ valid = 0;
+ }
}
- return;
-
-reject:
- printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
- pci_mmcfg_arch_free();
- xfree(pci_mmcfg_config);
- pci_mmcfg_config = NULL;
- pci_mmcfg_config_num = 0;
+ return valid;
}
void __init acpi_mmcfg_init(void)
{
+ bool_t valid = 1;
+
/* MMCONFIG disabled */
if ((pci_probe & PCI_PROBE_MMCONF) == 0)
return;
@@ -403,16 +397,17 @@ void __init acpi_mmcfg_init(void)
if (!(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF))
return;
- /* for late to exit */
- if (known_bridge)
- return;
-
- if (pci_mmcfg_check_hostbridge())
- known_bridge = 1;
+ if (pci_mmcfg_check_hostbridge()) {
+ unsigned int i;
- if (!known_bridge) {
+ pci_mmcfg_arch_init();
+ for (i = 0; i < pci_mmcfg_config_num; ++i)
+ if (pci_mmcfg_arch_enable(i))
+ valid = 0;
+ } else {
acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
- pci_mmcfg_reject_broken();
+ pci_mmcfg_arch_init();
+ valid = pci_mmcfg_reject_broken();
}
if ((pci_mmcfg_config_num == 0) ||
@@ -420,9 +415,41 @@ void __init acpi_mmcfg_init(void)
(pci_mmcfg_config[0].address == 0))
return;
- if (pci_mmcfg_arch_init()) {
+ if (valid)
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+}
+
+int pci_mmcfg_reserved(uint64_t address, unsigned int segment,
+ unsigned int start_bus, unsigned int end_bus,
+ unsigned int flags)
+{
+ unsigned int i;
+
+ if (flags & ~XEN_PCI_MMCFG_RESERVED)
+ return -EINVAL;
+
+ for (i = 0; i < pci_mmcfg_config_num; ++i) {
+ const typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[i];
+
+ if (cfg->pci_segment == segment &&
+ cfg->start_bus_number == start_bus &&
+ cfg->end_bus_number == end_bus) {
+ if (cfg->address != address) {
+ printk(KERN_WARNING
+ "Base address presented for segment %04x bus %02x-%02x"
+ " (%08" PRIx64 ") does not match previously obtained"
+ " one (%08" PRIx64 ")\n",
+ segment, start_bus, end_bus, address, cfg->address);
+ return -EIO;
+ }
+ if (flags & XEN_PCI_MMCFG_RESERVED)
+ return pci_mmcfg_arch_enable(i);
+ pci_mmcfg_arch_disable(i);
+ return 0;
+ }
}
+
+ return -ENODEV;
}
/**
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -112,7 +112,8 @@ int pci_mmcfg_write(unsigned int seg, un
return 0;
}
-static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+static void __iomem *mcfg_ioremap(const struct acpi_mcfg_allocation *cfg,
+ unsigned int prot)
{
unsigned long virt, size;
@@ -126,19 +127,55 @@ static void __iomem * __init mcfg_iorema
if (map_pages_to_xen(virt,
(cfg->address >> PAGE_SHIFT) +
(cfg->start_bus_number << (20 - PAGE_SHIFT)),
- size >> PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE))
+ size >> PAGE_SHIFT, prot))
return NULL;
return (void __iomem *) virt;
}
+int pci_mmcfg_arch_enable(unsigned int idx)
+{
+ const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
+
+ if (pci_mmcfg_virt[idx].virt)
+ return 0;
+ pci_mmcfg_virt[idx].virt = mcfg_ioremap(cfg, PAGE_HYPERVISOR_NOCACHE);
+ if (!pci_mmcfg_virt[idx].virt) {
+ printk(KERN_ERR "PCI: Cannot map MCFG aperture for segment %04x\n",
+ cfg->pci_segment);
+ return -ENOMEM;
+ }
+ printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
+ cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
+ return 0;
+}
+
+void pci_mmcfg_arch_disable(unsigned int idx)
+{
+ const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
+
+ pci_mmcfg_virt[idx].virt = NULL;
+ /*
+ * Don't use destroy_xen_mappings() here, or make sure that at least
+ * the necessary L4 entries get populated (so that they get properly
+ * propagated to guest domains' page tables).
+ */
+ mcfg_ioremap(cfg, 0);
+ printk(KERN_WARNING "PCI: Not using MCFG for segment %04x bus %02x-%02x\n",
+ cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
+}
+
int __init pci_mmcfg_arch_init(void)
{
int i;
+ if (pci_mmcfg_virt)
+ return 0;
+
pci_mmcfg_virt = xmalloc_array(struct mmcfg_virt, pci_mmcfg_config_num);
if (pci_mmcfg_virt == NULL) {
printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
+ pci_mmcfg_config_num = 0;
return 0;
}
memset(pci_mmcfg_virt, 0, sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num);
@@ -149,34 +186,5 @@ int __init pci_mmcfg_arch_init(void)
++mmcfg_pci_segment_shift;
}
mmcfg_pci_segment_shift += 20;
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
- if (!pci_mmcfg_virt[i].virt) {
- printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
- "segment %d\n",
- pci_mmcfg_config[i].pci_segment);
- pci_mmcfg_arch_free();
- return 0;
- }
- }
return 1;
}
-
-void __init pci_mmcfg_arch_free(void)
-{
- int i;
-
- if (pci_mmcfg_virt == NULL)
- return;
-
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- if (pci_mmcfg_virt[i].virt) {
- iounmap(pci_mmcfg_virt[i].virt);
- pci_mmcfg_virt[i].virt = NULL;
- pci_mmcfg_virt[i].cfg = NULL;
- }
- }
-
- xfree(pci_mmcfg_virt);
- pci_mmcfg_virt = NULL;
-}
--- a/xen/arch/x86/x86_64/physdev.c
+++ b/xen/arch/x86/x86_64/physdev.c
@@ -54,6 +54,10 @@
#define physdev_get_free_pirq compat_physdev_get_free_pirq
#define physdev_get_free_pirq_t physdev_get_free_pirq_compat_t
+#define xen_physdev_pci_mmcfg_reserved physdev_pci_mmcfg_reserved
+CHECK_physdev_pci_mmcfg_reserved;
+#undef xen_physdev_pci_mmcfg_reserved
+
#define COMPAT
#undef guest_handle_okay
#define guest_handle_okay compat_handle_okay
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -255,6 +255,19 @@ struct physdev_get_free_pirq {
typedef struct physdev_get_free_pirq physdev_get_free_pirq_t;
DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t);
+#define XEN_PCI_MMCFG_RESERVED 0x1
+
+#define PHYSDEVOP_pci_mmcfg_reserved 24
+struct physdev_pci_mmcfg_reserved {
+ uint64_t address;
+ uint16_t segment;
+ uint8_t start_bus;
+ uint8_t end_bus;
+ uint32_t flags;
+};
+typedef struct physdev_pci_mmcfg_reserved physdev_pci_mmcfg_reserved_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_mmcfg_reserved_t);
+
/*
* Notify that some PIRQ-bound event channels have been unmasked.
* ** This command is obsolete since interface version 0x00030202 and is **
--- a/xen/include/xlat.lst
+++ b/xen/include/xlat.lst
@@ -60,6 +60,7 @@
! memory_map memory.h
! memory_reservation memory.h
! pod_target memory.h
+? physdev_pci_mmcfg_reserved physdev.h
! sched_poll sched.h
? sched_remote_shutdown sched.h
? sched_shutdown sched.h

View File

@ -0,0 +1,72 @@
References: bnc#712051, CVE-2011-3131
# HG changeset patch
# User Tim Deegan <Tim.Deegan@citrix.com>
# Date 1313144964 -3600
# Node ID 537ed3b74b3f13267cfb3eb0e1483f432f3685cd
# Parent 1f08b380d4386cdd6714786a9163e5f51aecab5d
Passthrough: disable bus-mastering on any card that causes an IOMMU fault.
This stops the card from raising back-to-back faults and live-locking
the CPU that handles them.
Signed-off-by: Tim Deegan <tim@xen.org>
Acked-by: Wei Wang2 <wei.wang2@amd.com>
Acked-by: Allen M Kay <allen.m.kay@intel.com>
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -462,7 +462,7 @@ static hw_irq_controller iommu_msi_type
static void parse_event_log_entry(u32 entry[])
{
- u16 domain_id, device_id;
+ u16 domain_id, device_id, bdf, cword;
u32 code;
u64 *addr;
char * event_str[] = {"ILLEGAL_DEV_TABLE_ENTRY",
@@ -497,6 +497,18 @@ static void parse_event_log_entry(u32 en
"%s: domain = %d, device id = 0x%04x, "
"fault address = 0x%"PRIx64"\n",
event_str[code-1], domain_id, device_id, *addr);
+
+ /* Tell the device to stop DMAing; we can't rely on the guest to
+ * control it for us. */
+ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+ if ( get_dma_requestor_id(bdf) == device_id )
+ {
+ cword = pci_conf_read16(PCI_BUS(bdf), PCI_SLOT(bdf),
+ PCI_FUNC(bdf), PCI_COMMAND);
+ pci_conf_write16(PCI_BUS(bdf), PCI_SLOT(bdf),
+ PCI_FUNC(bdf), PCI_COMMAND,
+ cword & ~PCI_COMMAND_MASTER);
+ }
}
else
{
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -887,7 +887,7 @@ static void iommu_page_fault(int irq, vo
while (1)
{
u8 fault_reason;
- u16 source_id;
+ u16 source_id, cword;
u32 data;
u64 guest_addr;
int type;
@@ -920,6 +920,14 @@ static void iommu_page_fault(int irq, vo
iommu_page_fault_do_one(iommu, type, fault_reason,
source_id, guest_addr);
+ /* Tell the device to stop DMAing; we can't rely on the guest to
+ * control it for us. */
+ cword = pci_conf_read16(PCI_BUS(source_id), PCI_SLOT(source_id),
+ PCI_FUNC(source_id), PCI_COMMAND);
+ pci_conf_write16(PCI_BUS(source_id), PCI_SLOT(source_id),
+ PCI_FUNC(source_id), PCI_COMMAND,
+ cword & ~PCI_COMMAND_MASTER);
+
fault_index++;
if ( fault_index > cap_num_fault_regs(iommu->cap) )
fault_index = 0;

View File

@ -0,0 +1,55 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1313226769 -3600
# Node ID 8f647d409196f1d018f6284af03d1625cf8f93af
# Parent 537ed3b74b3f13267cfb3eb0e1483f432f3685cd
VT-d: don't reject valid DMAR/ATSR tables on systems with multiple PCI segments
On multi-PCI-segment systems, each segment has to be expected to have
an include-all DRHD and an all-ports ATSR, so the firmware consistency
check incorrectly rejects valid configurations there (which is
particularly problematic when the firmware also pre-enabled x2apic
mode, as the system will panic in that case due to being unable to
enable interrupt remapping). Thus constrain the check to just segment
0 for now; once full multi-segment support is there (which I'm working
on), it can be revisited whether we'd want to track this per segment,
or whether we trust the firmware of such large systems.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -427,13 +427,14 @@ acpi_parse_one_drhd(struct acpi_dmar_ent
if ( iommu_verbose )
dprintk(VTDPREFIX, " flags: INCLUDE_ALL\n");
/* Only allow one INCLUDE_ALL */
- if ( include_all )
+ if ( drhd->segment == 0 && include_all )
{
dprintk(XENLOG_WARNING VTDPREFIX,
"Only one INCLUDE_ALL device scope is allowed\n");
ret = -EINVAL;
}
- include_all = 1;
+ if ( drhd->segment == 0 )
+ include_all = 1;
}
if ( ret )
@@ -633,13 +634,14 @@ acpi_parse_one_atsr(struct acpi_dmar_ent
if ( iommu_verbose )
dprintk(VTDPREFIX, " flags: ALL_PORTS\n");
/* Only allow one ALL_PORTS */
- if ( all_ports )
+ if ( atsr->segment == 0 && all_ports )
{
dprintk(XENLOG_WARNING VTDPREFIX,
"Only one ALL_PORTS device scope is allowed\n");
ret = -EINVAL;
}
- all_ports = 1;
+ if ( atsr->segment == 0 )
+ all_ports = 1;
}
if ( ret )

View File

@ -0,0 +1,29 @@
# HG changeset patch
# User Andrew Cooper <andrew.cooper3@citrix.com>
# Date 1313226868 -3600
# Node ID 68b903bb1b01b2a6ef9c6e8ead3be3c1c2208341
# Parent 67b883402736ef1746cd6654da4c898f70f40723
x86: IRQ fix incorrect logic in __clear_irq_vector
In the old code, tmp_mask is the cpu_and of cfg->cpu_mask and
cpu_online_map. However, in the usual case of moving an IRQ from one
PCPU to another because the scheduler decides it's a good idea,
cfg->cpu_mask and cfg->old_cpu_mask do not intersect. This causes the
old cpu vector_irq table to keep the irq reference when it shouldn't.
This leads to a resource leak if a domain is shut down while an irq has
a move pending, which results in Xen's create_irq() eventually failing
with -ENOSPC when all vector_irq tables are full of stale references.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -190,6 +190,7 @@ static void __clear_irq_vector(int irq)
if (likely(!cfg->move_in_progress))
return;
+ cpus_and(tmp_mask, cfg->old_cpu_mask, cpu_online_map);
for_each_cpu_mask(cpu, tmp_mask) {
for (vector = FIRST_DYNAMIC_VECTOR; vector <= LAST_DYNAMIC_VECTOR;
vector++) {

295
23766-x86-msi-vf-bars.patch Normal file
View File

@ -0,0 +1,295 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1313226898 -3600
# Node ID 8d6edc3d26d26931f3732a2008fb4818bc7bab2d
# Parent 68b903bb1b01b2a6ef9c6e8ead3be3c1c2208341
x86/PCI-MSI: properly determine VF BAR values
As was discussed a couple of times on this list, SR-IOV virtual
functions have their BARs read as zero - the physical function's
SR-IOV capability structure must be consulted instead. The bogus
warnings people complained about are being eliminated with this
change.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -522,12 +522,48 @@ static int msi_capability_init(struct pc
return 0;
}
-static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir)
+static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir, int vf)
{
u8 limit;
- u32 addr;
+ u32 addr, base = PCI_BASE_ADDRESS_0, disp = 0;
- switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) & 0x7f )
+ if ( vf >= 0 )
+ {
+ struct pci_dev *pdev = pci_get_pdev(bus, PCI_DEVFN(slot, func));
+ unsigned int pos = pci_find_ext_capability(0, bus,
+ PCI_DEVFN(slot, func),
+ PCI_EXT_CAP_ID_SRIOV);
+ u16 ctrl = pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_CTRL);
+ u16 num_vf = pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_NUM_VF);
+ u16 offset = pci_conf_read16(bus, slot, func,
+ pos + PCI_SRIOV_VF_OFFSET);
+ u16 stride = pci_conf_read16(bus, slot, func,
+ pos + PCI_SRIOV_VF_STRIDE);
+
+ if ( !pdev || !pos ||
+ !(ctrl & PCI_SRIOV_CTRL_VFE) ||
+ !(ctrl & PCI_SRIOV_CTRL_MSE) ||
+ !num_vf || !offset || (num_vf > 1 && !stride) ||
+ bir >= PCI_SRIOV_NUM_BARS ||
+ !pdev->vf_rlen[bir] )
+ return 0;
+ base = pos + PCI_SRIOV_BAR;
+ vf -= PCI_BDF(bus, slot, func) + offset;
+ if ( vf < 0 || (vf && vf % stride) )
+ return 0;
+ if ( stride )
+ {
+ if ( vf % stride )
+ return 0;
+ vf /= stride;
+ }
+ if ( vf >= num_vf )
+ return 0;
+ BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS);
+ disp = vf * pdev->vf_rlen[bir];
+ limit = PCI_SRIOV_NUM_BARS;
+ }
+ else switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) & 0x7f )
{
case PCI_HEADER_TYPE_NORMAL:
limit = 6;
@@ -544,7 +580,7 @@ static u64 read_pci_mem_bar(u8 bus, u8 s
if ( bir >= limit )
return 0;
- addr = pci_conf_read32(bus, slot, func, PCI_BASE_ADDRESS_0 + bir * 4);
+ addr = pci_conf_read32(bus, slot, func, base + bir * 4);
if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
return 0;
if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 )
@@ -552,11 +588,10 @@ static u64 read_pci_mem_bar(u8 bus, u8 s
addr &= PCI_BASE_ADDRESS_MEM_MASK;
if ( ++bir >= limit )
return 0;
- return addr |
- ((u64)pci_conf_read32(bus, slot, func,
- PCI_BASE_ADDRESS_0 + bir * 4) << 32);
+ return addr + disp +
+ ((u64)pci_conf_read32(bus, slot, func, base + bir * 4) << 32);
}
- return addr & PCI_BASE_ADDRESS_MEM_MASK;
+ return (addr & PCI_BASE_ADDRESS_MEM_MASK) + disp;
}
/**
@@ -629,11 +664,29 @@ static int msix_capability_init(struct p
if ( !dev->msix_nr_entries )
{
+ u8 pbus, pslot, pfunc;
+ int vf;
u64 pba_paddr;
u32 pba_offset;
+ if ( !dev->info.is_virtfn )
+ {
+ pbus = bus;
+ pslot = slot;
+ pfunc = func;
+ vf = -1;
+ }
+ else
+ {
+ pbus = dev->info.physfn.bus;
+ pslot = PCI_SLOT(dev->info.physfn.devfn);
+ pfunc = PCI_FUNC(dev->info.physfn.devfn);
+ vf = PCI_BDF2(dev->bus, dev->devfn);
+ }
+
ASSERT(!dev->msix_used_entries);
- WARN_ON(msi->table_base != read_pci_mem_bar(bus, slot, func, bir));
+ WARN_ON(msi->table_base !=
+ read_pci_mem_bar(pbus, pslot, pfunc, bir, vf));
dev->msix_nr_entries = nr_entries;
dev->msix_table.first = PFN_DOWN(table_paddr);
@@ -645,7 +698,7 @@ static int msix_capability_init(struct p
pba_offset = pci_conf_read32(bus, slot, func,
msix_pba_offset_reg(pos));
bir = (u8)(pba_offset & PCI_MSIX_BIRMASK);
- pba_paddr = read_pci_mem_bar(bus, slot, func, bir);
+ pba_paddr = read_pci_mem_bar(pbus, pslot, pfunc, bir, vf);
WARN_ON(!pba_paddr);
pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -145,6 +145,7 @@ void pci_enable_acs(struct pci_dev *pdev
int pci_add_device(u8 bus, u8 devfn, const struct pci_dev_info *info)
{
struct pci_dev *pdev;
+ unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
const char *pdev_type;
int ret = -ENOMEM;
@@ -153,7 +154,14 @@ int pci_add_device(u8 bus, u8 devfn, con
else if (info->is_extfn)
pdev_type = "extended function";
else if (info->is_virtfn)
+ {
+ spin_lock(&pcidevs_lock);
+ pdev = pci_get_pdev(info->physfn.bus, info->physfn.devfn);
+ spin_unlock(&pcidevs_lock);
+ if ( !pdev )
+ pci_add_device(info->physfn.bus, info->physfn.devfn, NULL);
pdev_type = "virtual function";
+ }
else
return -EINVAL;
@@ -164,6 +172,70 @@ int pci_add_device(u8 bus, u8 devfn, con
if ( info )
pdev->info = *info;
+ else if ( !pdev->vf_rlen[0] )
+ {
+ unsigned int pos = pci_find_ext_capability(0, bus, devfn,
+ PCI_EXT_CAP_ID_SRIOV);
+ u16 ctrl = pci_conf_read16(bus, slot, func, pos + PCI_SRIOV_CTRL);
+
+ if ( !pos )
+ /* Nothing */;
+ else if ( !(ctrl & (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)) )
+ {
+ unsigned int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS);
+ for ( i = 0; i < PCI_SRIOV_NUM_BARS; ++i )
+ {
+ unsigned int idx = pos + PCI_SRIOV_BAR + i * 4;
+ u32 bar = pci_conf_read32(bus, slot, func, idx);
+ u32 hi = 0;
+
+ if ( (bar & PCI_BASE_ADDRESS_SPACE) ==
+ PCI_BASE_ADDRESS_SPACE_IO )
+ {
+ printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x with vf"
+ " BAR%u in IO space\n",
+ bus, slot, func, i);
+ continue;
+ }
+ pci_conf_write32(bus, slot, func, idx, ~0);
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ if ( i >= PCI_SRIOV_NUM_BARS )
+ {
+ printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x with"
+ " 64-bit vf BAR in last slot\n",
+ bus, slot, func);
+ break;
+ }
+ hi = pci_conf_read32(bus, slot, func, idx + 4);
+ pci_conf_write32(bus, slot, func, idx + 4, ~0);
+ }
+ pdev->vf_rlen[i] = pci_conf_read32(bus, slot, func, idx) &
+ PCI_BASE_ADDRESS_MEM_MASK;
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ pdev->vf_rlen[i] |= (u64)pci_conf_read32(bus, slot, func,
+ idx + 4) << 32;
+ pci_conf_write32(bus, slot, func, idx + 4, hi);
+ }
+ else if ( pdev->vf_rlen[i] )
+ pdev->vf_rlen[i] |= (u64)~0 << 32;
+ pci_conf_write32(bus, slot, func, idx, bar);
+ pdev->vf_rlen[i] = -pdev->vf_rlen[i];
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ ++i;
+ }
+ }
+ else
+ printk(XENLOG_WARNING "SR-IOV device %02x:%02x.%x has its virtual"
+ " functions already enabled (%04x)\n",
+ bus, slot, func, ctrl);
+ }
ret = 0;
if ( !pdev->domain )
@@ -183,7 +255,7 @@ int pci_add_device(u8 bus, u8 devfn, con
out:
spin_unlock(&pcidevs_lock);
printk(XENLOG_DEBUG "PCI add %s %02x:%02x.%x\n", pdev_type,
- bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ bus, slot, func);
return ret;
}
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -57,6 +57,7 @@ struct pci_dev {
const u8 bus;
const u8 devfn;
struct pci_dev_info info;
+ u64 vf_rlen[6];
};
#define for_each_pdev(domain, pdev) \
--- a/xen/include/xen/pci_regs.h
+++ b/xen/include/xen/pci_regs.h
@@ -425,7 +425,7 @@
#define PCI_EXT_CAP_ID_ACS 13
#define PCI_EXT_CAP_ID_ARI 14
#define PCI_EXT_CAP_ID_ATS 15
-#define PCI_EXT_CAP_ID_IOV 16
+#define PCI_EXT_CAP_ID_SRIOV 16
/* Advanced Error Reporting */
#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
@@ -545,4 +545,35 @@
#define PCI_ACS_CTRL 0x06 /* ACS Control Register */
#define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */
+/* Single Root I/O Virtualization */
+#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */
+#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */
+#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */
+#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */
+#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */
+#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */
+#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */
+#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */
+#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */
+#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */
+#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */
+#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */
+#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */
+#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */
+#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */
+#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */
+#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */
+#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */
+#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */
+#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */
+#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */
+#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */
+#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/
+#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */
+#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */
+#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */
+#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */
+#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */
+#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */
+
#endif /* LINUX_PCI_REGS_H */

View File

@ -0,0 +1,61 @@
References: bnc#701686
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1313503555 -3600
# Node ID fc2be6cb89ad49efd90fe1b650f7efaab72f61b2
# Parent 5c1ebc117f9901bc155d2b92ae902a4144767dfb
x86: simplify (and fix) clear_IO_APIC{,_pin}()
These are used during bootup and (emergency) shutdown only, and their
only purpose is to get the actual IO-APIC's RTE(s) cleared.
Consequently, only the "raw" accessors should be used (and the ones
going through interrupt remapping code can be skipped), with the
exception of determining the delivery mode: This one must always go
through the interrupt remapping path, as in the VT-d case the actual
IO-APIC's RTE will have the delivery mode always set to zero (which
before possibly could have resulted in such an entry getting cleared
in the "raw" pass, though I haven't observed this case in practice).
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -365,14 +365,12 @@ static void eoi_IO_APIC_irq(unsigned int
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#define clear_IO_APIC_pin(a,p) __clear_IO_APIC_pin(a,p,0)
-#define clear_IO_APIC_pin_raw(a,p) __clear_IO_APIC_pin(a,p,1)
-static void __clear_IO_APIC_pin(unsigned int apic, unsigned int pin, int raw)
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
- entry = ioapic_read_entry(apic, pin, raw);
+ entry = __ioapic_read_entry(apic, pin, FALSE);
if (entry.delivery_mode == dest_SMI)
return;
@@ -381,7 +379,7 @@ static void __clear_IO_APIC_pin(unsigned
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
- ioapic_write_entry(apic, pin, raw, entry);
+ __ioapic_write_entry(apic, pin, TRUE, entry);
}
static void clear_IO_APIC (void)
@@ -389,10 +387,8 @@ static void clear_IO_APIC (void)
int apic, pin;
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
clear_IO_APIC_pin(apic, pin);
- clear_IO_APIC_pin_raw(apic, pin);
- }
}
}

363
23772-x86-trampoline.patch Normal file
View File

@ -0,0 +1,363 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1313744066 -3600
# Node ID 29aeed4979a78f26519f5fde8a405f8438297ab9
# Parent fc2be6cb89ad49efd90fe1b650f7efaab72f61b2
x86: make run-time part of trampoline relocatable
In order to eliminate an initial hack in the EFI boot code (where
memory for the trampoline was just "claimed" instead of properly
allocated), the trampoline code must no longer make assumptions about the
address at which it would be located. For the time being, the fixed
address is being retained for the traditional multiboot path.
As an additional benefit (at least from my pov) it allows confining
the visibility of the BOOT_TRAMPOLINE definition to just the boot
code.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/boot/Makefile
+++ b/xen/arch/x86/boot/Makefile
@@ -2,8 +2,8 @@ obj-y += head.o
head.o: reloc.S
-BOOT_TRAMPOLINE := $(shell sed -n 's,^\#define[[:space:]]\{1\,\}BOOT_TRAMPOLINE[[:space:]]\{1\,\},,p' $(BASEDIR)/include/asm-x86/config.h)
+BOOT_TRAMPOLINE := $(shell sed -n 's,^\#define[[:space:]]\{1\,\}BOOT_TRAMPOLINE[[:space:]]\{1\,\},,p' head.S)
%.S: %.c
RELOC=$(BOOT_TRAMPOLINE) $(MAKE) -f build32.mk $@
-reloc.S: $(BASEDIR)/include/asm-x86/config.h
+reloc.S: head.S
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -9,7 +9,7 @@
.text
.code32
-#undef bootsym_phys
+#define BOOT_TRAMPOLINE 0x7c000
#define sym_phys(sym) ((sym) - __XEN_VIRT_START)
#define bootsym_phys(sym) ((sym) - trampoline_start + BOOT_TRAMPOLINE)
@@ -189,6 +189,17 @@ __start:
mov %edi,sym_phys(idle_pg_table_l2) + (__PAGE_OFFSET>>18)
#endif
+ /* Apply relocations to bootstrap trampoline. */
+ mov $BOOT_TRAMPOLINE,%edx
+ mov $sym_phys(__trampoline_rel_start),%edi
+ mov %edx,sym_phys(trampoline_phys)
+1:
+ mov (%edi),%eax
+ add %edx,(%edi,%eax)
+ add $4,%edi
+ cmp $sym_phys(__trampoline_rel_stop),%edi
+ jb 1b
+
/* Copy bootstrap trampoline to low memory, below 1MB. */
mov $sym_phys(trampoline_start),%esi
mov $bootsym_phys(trampoline_start),%edi
--- a/xen/arch/x86/boot/trampoline.S
+++ b/xen/arch/x86/boot/trampoline.S
@@ -4,6 +4,13 @@
#undef bootsym
#define bootsym(s) ((s)-trampoline_start)
+#define bootsym_rel(sym, off, opnd...) \
+ bootsym(sym),##opnd; \
+111:; \
+ .pushsection .trampoline_rel, "a"; \
+ .long 111b - (off) - .; \
+ .popsection
+
.globl trampoline_realmode_entry
trampoline_realmode_entry:
mov %cs,%ax
@@ -17,11 +24,11 @@ trampoline_realmode_entry:
xor %ax, %ax
inc %ax
lmsw %ax # CR0.PE = 1 (enter protected mode)
- ljmpl $BOOT_CS32,$bootsym_phys(trampoline_protmode_entry)
+ ljmpl $BOOT_CS32,$bootsym_rel(trampoline_protmode_entry,6)
idt_48: .word 0, 0, 0 # base = limit = 0
gdt_48: .word 6*8-1
- .long bootsym_phys(trampoline_gdt)
+ .long bootsym_rel(trampoline_gdt,4)
trampoline_gdt:
/* 0x0000: unused */
.quad 0x0000000000000000
@@ -32,11 +39,16 @@ trampoline_gdt:
/* 0x0018: ring 0 data */
.quad 0x00cf92000000ffff
/* 0x0020: real-mode code @ BOOT_TRAMPOLINE */
- .long 0x0000ffff | ((BOOT_TRAMPOLINE & 0x00ffff) << 16)
- .long 0x00009a00 | ((BOOT_TRAMPOLINE & 0xff0000) >> 16)
+ .long 0x0000ffff
+ .long 0x00009a00
/* 0x0028: real-mode data @ BOOT_TRAMPOLINE */
- .long 0x0000ffff | ((BOOT_TRAMPOLINE & 0x00ffff) << 16)
- .long 0x00009200 | ((BOOT_TRAMPOLINE & 0xff0000) >> 16)
+ .long 0x0000ffff
+ .long 0x00009200
+
+ .pushsection .trampoline_rel, "a"
+ .long trampoline_gdt + BOOT_PSEUDORM_CS + 2 - .
+ .long trampoline_gdt + BOOT_PSEUDORM_DS + 2 - .
+ .popsection
.globl cpuid_ext_features
cpuid_ext_features:
@@ -66,11 +78,11 @@ trampoline_protmode_entry:
/* Load pagetable base register. */
mov $sym_phys(idle_pg_table),%eax
- add bootsym_phys(trampoline_xen_phys_start),%eax
+ add bootsym_rel(trampoline_xen_phys_start,4,%eax)
mov %eax,%cr3
/* Set up EFER (Extended Feature Enable Register). */
- mov bootsym_phys(cpuid_ext_features),%edi
+ mov bootsym_rel(cpuid_ext_features,4,%edi)
test $0x20100800,%edi /* SYSCALL/SYSRET, No Execute, Long Mode? */
jz .Lskip_efer
movl $MSR_EFER,%ecx
@@ -93,7 +105,7 @@ trampoline_protmode_entry:
#if defined(__x86_64__)
/* Now in compatibility mode. Long-jump into 64-bit mode. */
- ljmp $BOOT_CS64,$bootsym_phys(start64)
+ ljmp $BOOT_CS64,$bootsym_rel(start64,6)
.code64
start64:
--- a/xen/arch/x86/boot/wakeup.S
+++ b/xen/arch/x86/boot/wakeup.S
@@ -42,15 +42,13 @@ ENTRY(wakeup_start)
# boot trampoline is under 1M, and shift its start into
# %fs to reference symbols in that area
- movl $BOOT_TRAMPOLINE, %eax
- shrl $4, %eax
- movl %eax, %fs
+ mov wakesym(trampoline_seg), %fs
lidt %fs:bootsym(idt_48)
lgdt %fs:bootsym(gdt_48)
movw $1, %ax
lmsw %ax # Turn on CR0.PE
- ljmpl $BOOT_CS32, $bootsym_phys(wakeup_32)
+ ljmpl $BOOT_CS32, $bootsym_rel(wakeup_32, 6)
/* This code uses an extended set of video mode numbers. These include:
* Aliases for standard modes
@@ -103,6 +101,10 @@ real_magic: .long 0x12345678
.globl video_mode, video_flags
video_mode: .long 0
video_flags: .long 0
+trampoline_seg: .word BOOT_TRAMPOLINE >> 4
+ .pushsection .trampoline_seg, "a"
+ .long trampoline_seg - .
+ .popsection
.code32
@@ -114,11 +116,11 @@ wakeup_32:
mov $BOOT_DS, %eax
mov %eax, %ds
mov %eax, %ss
- mov $bootsym_phys(early_stack), %esp
+ mov $bootsym_rel(early_stack, 4, %esp)
# check saved magic again
mov $sym_phys(saved_magic), %eax
- add bootsym_phys(trampoline_xen_phys_start), %eax
+ add bootsym_rel(trampoline_xen_phys_start, 4, %eax)
mov (%eax), %eax
cmp $0x9abcdef0, %eax
jne bogus_saved_magic
@@ -131,12 +133,12 @@ wakeup_32:
/* Load pagetable base register */
mov $sym_phys(idle_pg_table),%eax
- add bootsym_phys(trampoline_xen_phys_start),%eax
+ add bootsym_rel(trampoline_xen_phys_start,4,%eax)
mov %eax,%cr3
/* Will cpuid feature change after resume? */
/* Set up EFER (Extended Feature Enable Register). */
- mov bootsym_phys(cpuid_ext_features),%edi
+ mov bootsym_rel(cpuid_ext_features,4,%edi)
test $0x20100800,%edi /* SYSCALL/SYSRET, No Execute, Long Mode? */
jz .Lskip_eferw
movl $MSR_EFER,%ecx
@@ -162,7 +164,7 @@ wakeup_32:
#if defined(__x86_64__)
/* Now in compatibility mode. Long-jump to 64-bit mode */
- ljmp $BOOT_CS64, $bootsym_phys(wakeup_64)
+ ljmp $BOOT_CS64, $bootsym_rel(wakeup_64,6)
.code64
wakeup_64:
--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -599,6 +599,9 @@ static void __init relocate_image(unsign
}
}
+extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
+extern const s32 __trampoline_seg_start[], __trampoline_seg_stop[];
+
void EFIAPI __init __attribute__((__noreturn__))
efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
{
@@ -614,9 +617,10 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *mode_info;
EFI_FILE_HANDLE dir_handle;
union string section = { NULL }, name;
+ const s32 *trampoline_ptr;
struct e820entry *e;
u64 efer;
- bool_t base_video = 0, trampoline_okay = 0;
+ bool_t base_video = 0;
efi_ih = ImageHandle;
efi_bs = SystemTable->BootServices;
@@ -914,15 +918,27 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
dmi_efi_get_table((void *)(long)efi.smbios);
/* Allocate space for trampoline (in first Mb). */
- cfg.addr = BOOT_TRAMPOLINE;
+ cfg.addr = 0x100000;
cfg.size = trampoline_end - trampoline_start;
- status = efi_bs->AllocatePages(AllocateAddress, EfiLoaderData,
+ status = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData,
PFN_UP(cfg.size), &cfg.addr);
if ( EFI_ERROR(status) )
{
cfg.addr = 0;
- PrintErr(L"Note: Trampoline area is in use\r\n");
+ blexit(L"No memory for trampoline\r\n");
}
+ trampoline_phys = cfg.addr;
+ /* Apply relocations to trampoline. */
+ for ( trampoline_ptr = __trampoline_rel_start;
+ trampoline_ptr < __trampoline_rel_stop;
+ ++trampoline_ptr )
+ *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) +=
+ trampoline_phys;
+ for ( trampoline_ptr = __trampoline_seg_start;
+ trampoline_ptr < __trampoline_seg_stop;
+ ++trampoline_ptr )
+ *(u16 *)(*trampoline_ptr + (long)trampoline_ptr) =
+ trampoline_phys >> 4;
/* Initialise L2 identity-map and xen page table entries (16MB). */
for ( i = 0; i < 8; ++i )
@@ -1096,14 +1112,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
e->type = type;
++e820nr;
}
- if ( type == E820_RAM && e->addr <= BOOT_TRAMPOLINE &&
- e->addr + e->size >= BOOT_TRAMPOLINE + cfg.size )
- trampoline_okay = 1;
}
- if ( !trampoline_okay )
- blexit(L"Trampoline area unavailable\r\n");
-
status = efi_bs->ExitBootServices(ImageHandle, map_key);
if ( EFI_ERROR(status) )
PrintErrMesg(L"Cannot exit boot services", status);
@@ -1117,7 +1127,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
efi_fw_vendor = (void *)efi_fw_vendor + DIRECTMAP_VIRT_START;
relocate_image(__XEN_VIRT_START - xen_phys_start);
- memcpy((void *)(long)BOOT_TRAMPOLINE, trampoline_start, cfg.size);
+ memcpy((void *)trampoline_phys, trampoline_start, cfg.size);
/* Set system registers and transfer control. */
asm volatile("pushq $0\n\tpopfq");
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -49,6 +49,8 @@
#define setup_trampoline() (bootsym_phys(trampoline_realmode_entry))
+unsigned long __read_mostly trampoline_phys;
+
/* Set if we find a B stepping CPU */
static int smp_b_stepping;
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -22,6 +22,7 @@
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
+#include <xen/pfn.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
@@ -164,8 +165,9 @@ void __init zap_low_mappings(l2_pgentry_
flush_all(FLUSH_TLB_GLOBAL);
/* Replace with mapping of the boot trampoline only. */
- map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
- 0x10, __PAGE_HYPERVISOR);
+ map_pages_to_xen(trampoline_phys, trampoline_phys >> PAGE_SHIFT,
+ PFN_UP(trampoline_end - trampoline_start),
+ __PAGE_HYPERVISOR);
}
void __init subarch_init_memory(void)
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -828,7 +828,7 @@ void __init zap_low_mappings(void)
flush_local(FLUSH_TLB_GLOBAL);
/* Replace with mapping of the boot trampoline only. */
- map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
+ map_pages_to_xen(trampoline_phys, trampoline_phys >> PAGE_SHIFT,
PFN_UP(trampoline_end - trampoline_start),
__PAGE_HYPERVISOR);
}
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -103,6 +103,13 @@ SECTIONS
*(.init.data)
*(.init.data.rel)
*(.init.data.rel.*)
+ . = ALIGN(4);
+ __trampoline_rel_start = .;
+ *(.trampoline_rel)
+ __trampoline_rel_stop = .;
+ __trampoline_seg_start = .;
+ *(.trampoline_seg)
+ __trampoline_seg_stop = .;
} :text
. = ALIGN(32);
.init.setup : {
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -95,13 +95,13 @@
/* Primary stack is restricted to 8kB by guard pages. */
#define PRIMARY_STACK_SIZE 8192
-#define BOOT_TRAMPOLINE 0x7c000
+#ifndef __ASSEMBLY__
+extern unsigned long trampoline_phys;
#define bootsym_phys(sym) \
- (((unsigned long)&(sym)-(unsigned long)&trampoline_start)+BOOT_TRAMPOLINE)
+ (((unsigned long)&(sym)-(unsigned long)&trampoline_start)+trampoline_phys)
#define bootsym(sym) \
(*RELOC_HIDE((typeof(&(sym)))__va(__pa(&(sym))), \
- BOOT_TRAMPOLINE-__pa(trampoline_start)))
-#ifndef __ASSEMBLY__
+ trampoline_phys-__pa(trampoline_start)))
extern char trampoline_start[], trampoline_end[];
extern char trampoline_realmode_entry[];
extern unsigned int trampoline_xen_phys_start;

364
23774-x86_64-EFI-EDD.patch Normal file
View File

@ -0,0 +1,364 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1313744120 -3600
# Node ID e35c5202625ef5534561f84352833ad9467d986c
# Parent dd90b59cb11c60c48e174c899190e2967341fe32
x86-64/EFI: construct EDD data from device path protocol information
In the absence of a BIOS to handle INT13 requests, this information
must be constructed artificially instead when booted from EFI.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/boot/edd.S
+++ b/xen/arch/x86/boot/edd.S
@@ -16,21 +16,13 @@
* Updated and ported for Xen by Keir Fraser <keir@xensource.com> June 2007
*/
+#include <asm/edd.h>
+
.code16
/* Offset of disc signature in the MBR. */
#define EDD_MBR_SIG_OFFSET 0x1B8
-/* Maximum number of EDD information structures at boot_edd_info. */
-#define EDD_INFO_MAX 6
-
-/* Maximum number of MBR signatures at boot_mbr_signature. */
-#define EDD_MBR_SIG_MAX 16
-
-/* Size of components of EDD information structure. */
-#define EDDEXTSIZE 8
-#define EDDPARMSIZE 74
-
get_edd:
cmpb $2, bootsym(opt_edd) # edd=off ?
je edd_done
--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -16,6 +16,7 @@
#include <xen/stringify.h>
#include <xen/vga.h>
#include <asm/e820.h>
+#include <asm/edd.h>
#include <asm/mm.h>
#include <asm/msr.h>
#include <asm/processor.h>
@@ -539,6 +540,18 @@ static void __init split_value(char *s)
*s = 0;
}
+static void __init edd_put_string(u8 *dst, size_t n, const char *src)
+{   /* Fill the n-byte field dst with src, blank-padded (no NUL added). */
+    for ( ; n && *src; --n )    /* test n before decrementing: no wrap */
+        *dst++ = *src++;
+    if ( *src )                 /* src is longer than the field */
+        PrintErrMesg(L"Internal error populating EDD info",
+                     EFI_BUFFER_TOO_SMALL);
+    for ( ; n; --n )            /* pad every remaining byte */
+        *dst++ = ' ';
+}
+#define edd_put_string(d, s) edd_put_string(d, ARRAY_SIZE(d), s)
+
static int __init set_color(u32 mask, int bpp, u8 *pos, u8 *sz)
{
if ( bpp < 0 )
@@ -607,6 +620,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
{
static EFI_GUID __initdata loaded_image_guid = LOADED_IMAGE_PROTOCOL;
static EFI_GUID __initdata gop_guid = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+ static EFI_GUID __initdata bio_guid = BLOCK_IO_PROTOCOL;
+ static EFI_GUID __initdata devp_guid = DEVICE_PATH_PROTOCOL;
EFI_LOADED_IMAGE *loaded_image;
EFI_STATUS status;
unsigned int i, argc;
@@ -887,7 +902,148 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
place_string(&mbi.mem_upper, NULL);
- /* XXX Collect EDD info. */
+ /* Collect EDD info. */
+ BUILD_BUG_ON(offsetof(struct edd_info, edd_device_params) != EDDEXTSIZE);
+ BUILD_BUG_ON(sizeof(struct edd_device_params) != EDDPARMSIZE);
+ size = 0;
+ status = efi_bs->LocateHandle(ByProtocol, &bio_guid, NULL, &size, NULL);
+ if ( status == EFI_BUFFER_TOO_SMALL )
+ status = efi_bs->AllocatePool(EfiLoaderData, size, (void **)&handles);
+ if ( !EFI_ERROR(status) )
+ status = efi_bs->LocateHandle(ByProtocol, &bio_guid, NULL, &size,
+ handles);
+ if ( EFI_ERROR(status) )
+ size = 0;
+ for ( i = 0; i < size / sizeof(*handles); ++i )
+ {
+ EFI_BLOCK_IO *bio;
+ EFI_DEV_PATH_PTR devp;
+ struct edd_info *info = boot_edd_info + boot_edd_info_nr;
+ struct edd_device_params *params = &info->edd_device_params;
+ enum { root, acpi, pci, ctrlr } state = root;
+
+ status = efi_bs->HandleProtocol(handles[i], &bio_guid, (void **)&bio);
+ if ( EFI_ERROR(status) ||
+ bio->Media->RemovableMedia ||
+ bio->Media->LogicalPartition )
+ continue;
+ if ( boot_edd_info_nr < EDD_INFO_MAX )
+ {
+ info->device = 0x80 + boot_edd_info_nr; /* fake */
+ info->version = 0x11;
+ params->length = offsetof(struct edd_device_params, dpte_ptr);
+ params->number_of_sectors = bio->Media->LastBlock + 1;
+ params->bytes_per_sector = bio->Media->BlockSize;
+ params->dpte_ptr = ~0;
+ }
+ ++boot_edd_info_nr;
+ status = efi_bs->HandleProtocol(handles[i], &devp_guid,
+ (void **)&devp);
+ if ( EFI_ERROR(status) )
+ continue;
+ for ( ; !IsDevicePathEnd(devp.DevPath);
+ devp.DevPath = NextDevicePathNode(devp.DevPath) )
+ {
+ switch ( DevicePathType(devp.DevPath) )
+ {
+ const u8 *p;
+
+ case ACPI_DEVICE_PATH:
+ if ( state != root || boot_edd_info_nr > EDD_INFO_MAX )
+ break;
+ switch ( DevicePathSubType(devp.DevPath) )
+ {
+ case ACPI_DP:
+ if ( devp.Acpi->HID != EISA_PNP_ID(0xA03) &&
+ devp.Acpi->HID != EISA_PNP_ID(0xA08) )
+ break;
+ params->interface_path.pci.bus = devp.Acpi->UID;
+ state = acpi;
+ break;
+ case EXPANDED_ACPI_DP:
+ /* XXX */
+ break;
+ }
+ break;
+ case HARDWARE_DEVICE_PATH:
+ if ( state != acpi ||
+ DevicePathSubType(devp.DevPath) != HW_PCI_DP ||
+ boot_edd_info_nr > EDD_INFO_MAX )
+ break;
+ state = pci;
+ edd_put_string(params->host_bus_type, "PCI");
+ params->interface_path.pci.slot = devp.Pci->Device;
+ params->interface_path.pci.function = devp.Pci->Function;
+ break;
+ case MESSAGING_DEVICE_PATH:
+ if ( state != pci || boot_edd_info_nr > EDD_INFO_MAX )
+ break;
+ state = ctrlr;
+ switch ( DevicePathSubType(devp.DevPath) )
+ {
+ case MSG_ATAPI_DP:
+ edd_put_string(params->interface_type, "ATAPI");
+ params->interface_path.pci.channel =
+ devp.Atapi->PrimarySecondary;
+ params->device_path.atapi.device = devp.Atapi->SlaveMaster;
+ params->device_path.atapi.lun = devp.Atapi->Lun;
+ break;
+ case MSG_SCSI_DP:
+ edd_put_string(params->interface_type, "SCSI");
+ params->device_path.scsi.id = devp.Scsi->Pun;
+ params->device_path.scsi.lun = devp.Scsi->Lun;
+ break;
+ case MSG_FIBRECHANNEL_DP:
+ edd_put_string(params->interface_type, "FIBRE");
+ params->device_path.fibre.wwid = devp.FibreChannel->WWN;
+ params->device_path.fibre.lun = devp.FibreChannel->Lun;
+ break;
+ case MSG_1394_DP:
+ edd_put_string(params->interface_type, "1394");
+ params->device_path.i1394.eui = devp.F1394->Guid;
+ break;
+ case MSG_USB_DP:
+ case MSG_USB_CLASS_DP:
+ edd_put_string(params->interface_type, "USB");
+ break;
+ case MSG_I2O_DP:
+ edd_put_string(params->interface_type, "I2O");
+ params->device_path.i2o.identity_tag = devp.I2O->Tid;
+ break;
+ default:
+ continue;
+ }
+ info->version = 0x30;
+ params->length = sizeof(struct edd_device_params);
+ params->key = 0xbedd;
+ params->device_path_info_length =
+ sizeof(struct edd_device_params) -
+ offsetof(struct edd_device_params, key);
+ for ( p = (const u8 *)&params->key; p < &params->checksum; ++p )
+ params->checksum -= *p;
+ break;
+ case MEDIA_DEVICE_PATH:
+ if ( DevicePathSubType(devp.DevPath) == MEDIA_HARDDRIVE_DP &&
+ devp.HardDrive->MBRType == MBR_TYPE_PCAT &&
+ boot_mbr_signature_nr < EDD_MBR_SIG_MAX )
+ {
+ struct mbr_signature *sig = boot_mbr_signature +
+ boot_mbr_signature_nr;
+
+ sig->device = 0x80 + boot_edd_info_nr; /* fake */
+ memcpy(&sig->signature, devp.HardDrive->Signature,
+ sizeof(sig->signature));
+ ++boot_mbr_signature_nr;
+ }
+ break;
+ }
+ }
+ }
+ if ( handles )
+ efi_bs->FreePool(handles);
+ if ( boot_edd_info_nr > EDD_INFO_MAX )
+ boot_edd_info_nr = EDD_INFO_MAX;
+
/* XXX Collect EDID info. */
if ( cpuid_eax(0x80000000) > 0x80000000 )
--- a/xen/include/asm-x86/edd.h
+++ b/xen/include/asm-x86/edd.h
@@ -23,6 +23,8 @@
#ifndef __XEN_EDD_H__
#define __XEN_EDD_H__
+#ifndef __ASSEMBLY__
+
struct edd_info {
/* Int13, Fn48: Check Extensions Present. */
u8 device; /* %dl: device */
@@ -33,10 +35,106 @@ struct edd_info {
u8 legacy_max_head; /* %dh: maximum head number */
u8 legacy_sectors_per_track; /* %cl[5:0]: maximum sector number */
/* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
- struct {
+ struct edd_device_params {
u16 length;
- u8 data[72];
- } edd_device_params;
+ u16 info_flags;
+ u32 num_default_cylinders;
+ u32 num_default_heads;
+ u32 sectors_per_track;
+ u64 number_of_sectors;
+ u16 bytes_per_sector;
+ u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */
+ u16 key; /* = 0xBEDD */
+ u8 device_path_info_length;
+ u8 reserved2;
+ u16 reserved3;
+ u8 host_bus_type[4];
+ u8 interface_type[8];
+ union {
+ struct {
+ u16 base_address;
+ u16 reserved1;
+ u32 reserved2;
+ } __attribute__ ((packed)) isa;
+ struct {
+ u8 bus;
+ u8 slot;
+ u8 function;
+ u8 channel;
+ u32 reserved;
+ } __attribute__ ((packed)) pci;
+ /* pcix is same as pci */
+ struct {
+ u64 reserved;
+ } __attribute__ ((packed)) ibnd;
+ struct {
+ u64 reserved;
+ } __attribute__ ((packed)) xprs;
+ struct {
+ u64 reserved;
+ } __attribute__ ((packed)) htpt;
+ struct {
+ u64 reserved;
+ } __attribute__ ((packed)) unknown;
+ } interface_path;
+ union {
+ struct {
+ u8 device;
+ u8 reserved1;
+ u16 reserved2;
+ u32 reserved3;
+ u64 reserved4;
+ } __attribute__ ((packed)) ata;
+ struct {
+ u8 device;
+ u8 lun;
+ u8 reserved1;
+ u8 reserved2;
+ u32 reserved3;
+ u64 reserved4;
+ } __attribute__ ((packed)) atapi;
+ struct {
+ u16 id;
+ u64 lun;
+ u16 reserved1;
+ u32 reserved2;
+ } __attribute__ ((packed)) scsi;
+ struct {
+ u64 serial_number;
+ u64 reserved;
+ } __attribute__ ((packed)) usb;
+ struct {
+ u64 eui;
+ u64 reserved;
+ } __attribute__ ((packed)) i1394;
+ struct {
+ u64 wwid;
+ u64 lun;
+ } __attribute__ ((packed)) fibre;
+ struct {
+ u64 identity_tag;
+ u64 reserved;
+ } __attribute__ ((packed)) i2o;
+ struct {
+ u32 array_number;
+ u32 reserved1;
+ u64 reserved2;
+ } __attribute__ ((packed)) raid;
+ struct {
+ u8 device;
+ u8 reserved1;
+ u16 reserved2;
+ u32 reserved3;
+ u64 reserved4;
+ } __attribute__ ((packed)) sata;
+ struct {
+ u64 reserved1;
+ u64 reserved2;
+ } __attribute__ ((packed)) unknown;
+ } device_path;
+ u8 reserved4;
+ u8 checksum;
+ } __attribute__ ((packed)) edd_device_params;
} __attribute__ ((packed));
struct mbr_signature {
@@ -51,4 +149,16 @@ extern u8 boot_mbr_signature_nr;
extern struct edd_info boot_edd_info[];
extern u8 boot_edd_info_nr;
+#endif /* __ASSEMBLY__ */
+
+/* Maximum number of EDD information structures at boot_edd_info. */
+#define EDD_INFO_MAX 6
+
+/* Maximum number of MBR signatures at boot_mbr_signature. */
+#define EDD_MBR_SIG_MAX 16
+
+/* Size of components of EDD information structure. */
+#define EDDEXTSIZE 8
+#define EDDPARMSIZE 74
+
#endif /* __XEN_EDD_H__ */

View File

@ -0,0 +1,56 @@
# HG changeset patch
# User Andrew Cooper <andrew.cooper3@citrix.com>
# Date 1313744302 -3600
# Node ID 0ddb4481f883ddf55c12a0b8d1445cf137ef0b63
# Parent 9957bef3e7b4511f83ed8883cd5ecd49ea3ee95d
x86/KEXEC: disable hpet legacy broadcasts earlier
On x2apic machines which booted in xapic mode,
hpet_disable_legacy_broadcast() sends an event check IPI to all online
processors. This leads to a protection fault as the genapic blindly
pokes x2apic MSRs while the local apic is in xapic mode.
One option is to change genapic when we shut down the local apic, but
there are still problems with trying to IPI processors in the online
processor map which are actually sitting in NMI loops.
Another option is to have each CPU take itself out of the online CPU
map during the NMI shootdown.
Realistically however, disabling hpet legacy broadcasts earlier in the
kexec path is the easiest fix to the problem.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -27,6 +27,7 @@
#include <asm/hvm/support.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
+#include <asm/hpet.h>
static atomic_t waiting_for_crash_ipi;
static unsigned int crashing_cpu;
@@ -59,6 +60,9 @@ static void nmi_shootdown_cpus(void)
local_irq_disable();
+ if ( hpet_broadcast_is_available() )
+ hpet_disable_legacy_broadcast();
+
crashing_cpu = smp_processor_id();
local_irq_count(crashing_cpu) = 0;
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -96,9 +96,6 @@ void machine_kexec(xen_kexec_image_t *im
.limit = LAST_RESERVED_GDT_BYTE
};
- if ( hpet_broadcast_is_available() )
- hpet_disable_legacy_broadcast();
-
/*
* compat_machine_kexec() returns to idle pagetables, which requires us
* to be running on a static GDT mapping (idle pagetables have no GDT

View File

@ -0,0 +1,68 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1314004239 -3600
# Node ID 0849b0e59e2418e8215616df147f955b01b07577
# Parent 07f78b5bd03c02e32324eaa00487643d27b7ffa8
pm: don't truncate processors' ACPI IDs to 8 bits
This is just another adjustment to allow systems with very many CPUs
(or unusual ACPI IDs) to be properly power-managed.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/ia64/linux-xen/acpi.c
+++ b/xen/arch/ia64/linux-xen/acpi.c
@@ -223,11 +223,14 @@ static u16 ia64_acpiid_to_sapicid[ MAX_L
{[0 ... MAX_LOCAL_SAPIC - 1] = 0xffff };
/* acpi id to cpu id */
-int get_cpu_id(u8 acpi_id)
+int get_cpu_id(u32 acpi_id)
{
int i;
u16 apic_id;
+ if ( acpi_id >= MAX_LOCAL_SAPIC )
+ return -EINVAL;
+
apic_id = ia64_acpiid_to_sapicid[acpi_id];
if ( apic_id == 0xffff )
return -EINVAL;
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -871,11 +871,14 @@ static void set_cx(
acpi_power->safe_state = cx;
}
-int get_cpu_id(u8 acpi_id)
+int get_cpu_id(u32 acpi_id)
{
int i;
u32 apic_id;
+ if ( acpi_id >= MAX_MADT_ENTRIES )
+ return -1;
+
apic_id = x86_acpiid_to_apicid[acpi_id];
if ( apic_id == BAD_APICID )
return -1;
@@ -952,7 +955,7 @@ long set_cx_pminfo(uint32_t cpu, struct
print_cx_pminfo(cpu, power);
/* map from acpi_id to cpu_id */
- cpu_id = get_cpu_id((u8)cpu);
+ cpu_id = get_cpu_id(cpu);
if ( cpu_id == -1 )
{
printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
--- a/xen/include/acpi/cpufreq/processor_perf.h
+++ b/xen/include/acpi/cpufreq/processor_perf.h
@@ -6,7 +6,7 @@
#define XEN_PX_INIT 0x80000000
-int get_cpu_id(u8);
+int get_cpu_id(u32);
int powernow_cpufreq_init(void);
unsigned int powernow_register_driver(void);
unsigned int get_measured_perf(unsigned int cpu, unsigned int flag);

View File

@ -0,0 +1,71 @@
References: bnc#701686
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1314004270 -3600
# Node ID 25dfe53bb1898b3967ceb71a7eb60a8b760c25fb
# Parent 0849b0e59e2418e8215616df147f955b01b07577
x86/IO-APIC: clear remoteIRR in clear_IO_APIC_pin()
It was found that in a crash scenario, the remoteIRR bit in an IO-APIC
RTE could be left set, causing problems when bringing up a kdump
kernel. While this generally is most important to be taken care of in
the new kernel (which usually would be a native one), it still seems
desirable to also address this problem in Xen so that (a) the problem
doesn't bite Xen when used as a secondary emergency kernel and (b) an
attempt is being made to save un-fixed secondary kernels from running
into said problem.
Based on a Linux patch from suresh.b.siddha@intel.com.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -375,11 +375,46 @@ static void clear_IO_APIC_pin(unsigned i
return;
/*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remoteIRR is set.
+ */
+ if (!entry.mask) {
+ entry.mask = 1;
+ __ioapic_write_entry(apic, pin, FALSE, entry);
+ }
+ entry = __ioapic_read_entry(apic, pin, TRUE);
+
+ if (entry.irr) {
+ /* Make sure the trigger mode is set to level. */
+ if (!entry.trigger) {
+ entry.trigger = 1;
+ __ioapic_write_entry(apic, pin, TRUE, entry);
+ }
+ if (mp_ioapics[apic].mpc_apicver >= 0x20)
+ io_apic_eoi(apic, entry.vector);
+ else {
+ /*
+ * Mechanism by which we clear remoteIRR in this case is by
+ * changing the trigger mode to edge and back to level.
+ */
+ entry.trigger = 0;
+ __ioapic_write_entry(apic, pin, TRUE, entry);
+ entry.trigger = 1;
+ __ioapic_write_entry(apic, pin, TRUE, entry);
+ }
+ }
+
+ /*
* Disable it in the IO-APIC irq-routing table:
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
__ioapic_write_entry(apic, pin, TRUE, entry);
+
+ entry = __ioapic_read_entry(apic, pin, TRUE);
+ if (entry.irr)
+ printk(KERN_ERR "IO-APIC%02x-%u: Unable to reset IRR\n",
+ IO_APIC_ID(apic), pin);
}
static void clear_IO_APIC (void)

View File

@ -0,0 +1,266 @@
# HG changeset patch
# User Jan Beulich <jbeulich@novell.com>
# Date 1314004356 -3600
# Node ID 2029263c501c315fa4d94845e5cfa6a9b0b395d5
# Parent 25dfe53bb1898b3967ceb71a7eb60a8b760c25fb
ACPI: add _PDC input override mechanism
In order to have Dom0 call _PDC with input fully representing Xen's
capabilities, and in order to avoid building knowledge of Xen
implementation details into Dom0, this provides a mechanism by which
the Dom0 kernel can, once it filled the _PDC input buffer according to
its own knowledge, present the buffer to Xen to apply overrides for
the parts of the C-, P-, and T-state management that it controls. This
is particularly to address the dependency of Xen using MWAIT to enter
certain C-states on the availability of the break-on-interrupt
extension (which the Dom0 kernel should have no need to know about).
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- a/xen/arch/ia64/linux-xen/acpi.c
+++ b/xen/arch/ia64/linux-xen/acpi.c
@@ -243,6 +243,13 @@ int get_cpu_id(u32 acpi_id)
return -1;
}
+
+int arch_acpi_set_pdc_bits(u32 acpi_id, u32 *pdc, u32 mask)
+{
+ pdc[2] |= ACPI_PDC_EST_CAPABILITY_SMP & mask;
+ return 0;
+}
+
#endif
static int __init
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -619,12 +619,6 @@ static int init_cx_pminfo(struct acpi_pr
return 0;
}
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define MWAIT_SUBSTATE_MASK (0xf)
#define MWAIT_SUBSTATE_SIZE (4)
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -1006,3 +1006,47 @@ unsigned int acpi_get_processor_id(unsig
return INVALID_ACPIID;
}
+
+static void get_mwait_ecx(void *info)
+{
+ *(u32 *)info = cpuid_ecx(CPUID_MWAIT_LEAF);
+}
+
+int arch_acpi_set_pdc_bits(u32 acpi_id, u32 *pdc, u32 mask)
+{
+ unsigned int cpu = get_cpu_id(acpi_id);
+ struct cpuinfo_x86 *c;
+ u32 ecx;
+
+ if (!(acpi_id + 1))
+ c = &boot_cpu_data;
+ else if (cpu >= NR_CPUS || !cpu_online(cpu))
+ return -EINVAL;
+ else
+ c = cpu_data + cpu;
+
+ pdc[2] |= ACPI_PDC_C_CAPABILITY_SMP & mask;
+
+ if (cpu_has(c, X86_FEATURE_EST))
+ pdc[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP & mask;
+
+ if (cpu_has(c, X86_FEATURE_ACPI))
+ pdc[2] |= ACPI_PDC_T_FFH & mask;
+
+ /*
+ * If mwait/monitor or its break-on-interrupt extension are
+ * unsupported, Cx_FFH will be disabled.
+ */
+ if (!cpu_has(c, X86_FEATURE_MWAIT) ||
+ c->cpuid_level < CPUID_MWAIT_LEAF)
+ ecx = 0;
+ else if (c == &boot_cpu_data || cpu == smp_processor_id())
+ ecx = cpuid_ecx(CPUID_MWAIT_LEAF);
+ else
+ on_selected_cpus(cpumask_of(cpu), get_mwait_ecx, &ecx, 1);
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+ pdc[2] &= ~(ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH);
+
+ return 0;
+}
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -419,6 +419,15 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
ret = -EINVAL;
break;
+ case XEN_PM_PDC:
+ {
+ XEN_GUEST_HANDLE(uint32) pdc;
+
+ guest_from_compat_handle(pdc, op->u.set_pminfo.u.pdc);
+ ret = acpi_set_pdc_bits(op->u.set_pminfo.id, pdc);
+ }
+ break;
+
default:
ret = -EINVAL;
break;
--- a/xen/drivers/acpi/pmstat.c
+++ b/xen/drivers/acpi/pmstat.c
@@ -519,3 +519,34 @@ int do_pm_op(struct xen_sysctl_pm_op *op
return ret;
}
+
+int acpi_set_pdc_bits(u32 acpi_id, XEN_GUEST_HANDLE(uint32) pdc)
+{
+ u32 bits[3];
+ int ret;
+
+ if ( copy_from_guest(bits, pdc, 2) )
+ ret = -EFAULT;
+ else if ( bits[0] != ACPI_PDC_REVISION_ID || !bits[1] )
+ ret = -EINVAL;
+ else if ( copy_from_guest_offset(bits + 2, pdc, 2, 1) )
+ ret = -EFAULT;
+ else
+ {
+ u32 mask = 0;
+
+ if ( xen_processor_pmbits & XEN_PROCESSOR_PM_CX )
+ mask |= ACPI_PDC_C_MASK | ACPI_PDC_SMP_C1PT;
+ if ( xen_processor_pmbits & XEN_PROCESSOR_PM_PX )
+ mask |= ACPI_PDC_P_MASK | ACPI_PDC_SMP_C1PT;
+ if ( xen_processor_pmbits & XEN_PROCESSOR_PM_TX )
+ mask |= ACPI_PDC_T_MASK | ACPI_PDC_SMP_C1PT;
+ bits[2] &= (ACPI_PDC_C_MASK | ACPI_PDC_P_MASK | ACPI_PDC_T_MASK |
+ ACPI_PDC_SMP_C1PT) & ~mask;
+ ret = arch_acpi_set_pdc_bits(acpi_id, bits, mask);
+ }
+ if ( !ret )
+ ret = copy_to_guest_offset(pdc, 2, bits + 2, 1);
+
+ return ret;
+}
--- a/xen/include/acpi/cpufreq/processor_perf.h
+++ b/xen/include/acpi/cpufreq/processor_perf.h
@@ -3,10 +3,10 @@
#include <public/platform.h>
#include <public/sysctl.h>
+#include <xen/acpi.h>
#define XEN_PX_INIT 0x80000000
-int get_cpu_id(u32);
int powernow_cpufreq_init(void);
unsigned int powernow_register_driver(void);
unsigned int get_measured_perf(unsigned int cpu, unsigned int flag);
--- a/xen/include/acpi/pdc_intel.h
+++ b/xen/include/acpi/pdc_intel.h
@@ -4,6 +4,8 @@
#ifndef __PDC_INTEL_H__
#define __PDC_INTEL_H__
+#define ACPI_PDC_REVISION_ID 1
+
#define ACPI_PDC_P_FFH (0x0001)
#define ACPI_PDC_C_C1_HALT (0x0002)
#define ACPI_PDC_T_FFH (0x0004)
@@ -14,6 +16,7 @@
#define ACPI_PDC_SMP_T_SWCOORD (0x0080)
#define ACPI_PDC_C_C1_FFH (0x0100)
#define ACPI_PDC_C_C2C3_FFH (0x0200)
+#define ACPI_PDC_SMP_P_HWCOORD (0x0800)
#define ACPI_PDC_EST_CAPABILITY_SMP (ACPI_PDC_SMP_C1PT | \
ACPI_PDC_C_C1_HALT | \
@@ -22,6 +25,7 @@
#define ACPI_PDC_EST_CAPABILITY_SWSMP (ACPI_PDC_SMP_C1PT | \
ACPI_PDC_C_C1_HALT | \
ACPI_PDC_SMP_P_SWCOORD | \
+ ACPI_PDC_SMP_P_HWCOORD | \
ACPI_PDC_P_FFH)
#define ACPI_PDC_C_CAPABILITY_SMP (ACPI_PDC_SMP_C2C3 | \
@@ -30,4 +34,17 @@
ACPI_PDC_C_C1_FFH | \
ACPI_PDC_C_C2C3_FFH)
+#define ACPI_PDC_C_MASK (ACPI_PDC_C_C1_HALT | \
+ ACPI_PDC_C_C1_FFH | \
+ ACPI_PDC_SMP_C2C3 | \
+ ACPI_PDC_SMP_C_SWCOORD | \
+ ACPI_PDC_C_C2C3_FFH)
+
+#define ACPI_PDC_P_MASK (ACPI_PDC_P_FFH | \
+ ACPI_PDC_SMP_P_SWCOORD | \
+ ACPI_PDC_SMP_P_HWCOORD)
+
+#define ACPI_PDC_T_MASK (ACPI_PDC_T_FFH | \
+ ACPI_PDC_SMP_T_SWCOORD)
+
#endif /* __PDC_INTEL_H__ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -150,6 +150,10 @@
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
#define cpufeat_mask(idx) (1u << ((idx) & 31))
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
#ifdef __i386__
#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
--- a/xen/include/public/platform.h
+++ b/xen/include/public/platform.h
@@ -304,6 +304,7 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_getidletim
#define XEN_PM_CX 0
#define XEN_PM_PX 1
#define XEN_PM_TX 2
+#define XEN_PM_PDC 3
/* Px sub info type */
#define XEN_PX_PCT 1
@@ -401,6 +402,7 @@ struct xenpf_set_processor_pminfo {
union {
struct xen_processor_power power;/* Cx: _CST/_CSD */
struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
+ XEN_GUEST_HANDLE(uint32) pdc; /* _PDC */
} u;
};
typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -334,6 +334,8 @@ static inline int acpi_boot_table_init(v
#endif /*!CONFIG_ACPI_BOOT*/
+int get_cpu_id(u32 acpi_id);
+
unsigned int acpi_register_gsi (u32 gsi, int edge_level, int active_high_low);
int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
@@ -431,6 +433,9 @@ static inline unsigned int acpi_get_csta
static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; }
#endif
+int acpi_set_pdc_bits(u32 acpi_id, XEN_GUEST_HANDLE(uint32));
+int arch_acpi_set_pdc_bits(u32 acpi_id, u32 *, u32 mask);
+
#ifdef CONFIG_ACPI_NUMA
int acpi_get_pxm(acpi_handle handle);
#else

View File

@ -2,7 +2,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -2927,7 +2927,7 @@ class XendDomainInfo:
@@ -2926,7 +2926,7 @@ class XendDomainInfo:
self.guest_bitsize = self.image.getBitSize()
# Make sure there's enough RAM available for the domain

View File

@ -63,7 +63,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -1489,6 +1489,20 @@ class XendDomainInfo:
@@ -1488,6 +1488,20 @@ class XendDomainInfo:
target = max_target
self.setMemoryTarget(target)

View File

@ -2,7 +2,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -3130,6 +3130,11 @@ class XendDomainInfo:
@@ -3129,6 +3129,11 @@ class XendDomainInfo:
self._cleanup_phantom_devs(paths)
self._cleanupVm()

View File

@ -2,7 +2,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -1297,8 +1297,15 @@ class XendDomainInfo:
@@ -1296,8 +1296,15 @@ class XendDomainInfo:
frontpath = self.getDeviceController(deviceClass).frontendPath(dev)
backpath = xstransact.Read(frontpath, "backend")
thread.start_new_thread(self.getDeviceController(deviceClass).finishDeviceCleanup, (backpath, path))

View File

@ -0,0 +1,74 @@
Print a warning and exit xl if xend is running. It is not
recommended to use libxenlight in conjunction with legacy xend
toolstack.
xl could be useful even when xend is running, e.g. to debug
xend itself, so add a '-f' option to override the exit.
Index: xen-4.1.1-testing/tools/libxl/xl.c
===================================================================
--- xen-4.1.1-testing.orig/tools/libxl/xl.c
+++ xen-4.1.1-testing/tools/libxl/xl.c
@@ -88,12 +88,16 @@ int main(int argc, char **argv)
char *config_file;
void *config_data = 0;
int config_len = 0;
+ int force = 0;
- while ((opt = getopt(argc, argv, "+v")) >= 0) {
+ while ((opt = getopt(argc, argv, "+vf")) >= 0) {
switch (opt) {
case 'v':
if (minmsglevel > 0) minmsglevel--;
break;
+ case 'f':
+ force = 1;
+ break;
default:
fprintf(stderr, "unknown global option\n");
exit(2);
@@ -107,6 +111,22 @@ int main(int argc, char **argv)
exit(1);
}
opterr = 0;
+ /*
+ * On SUSE, if xend is running (and user isn't asking for help),
+ * print a warning and exit unless forced.
+ */
+ if ((system("/usr/sbin/xend status") == 0) && strcmp(cmd, "help")) {
+ if (force == 0) {
+ fprintf(stderr, "WARNING: xend is running! It is not recommended "
+ "using libxenlight in\nconjunction with the legacy xend "
+ "toolstack. Use -f (force) to override\n");
+ exit(1);
+ } else {
+ fprintf(stderr, "WARNING: xend is running! It is not recommended "
+ "using libxenlight in\nconjunction with the legacy xend "
+ "toolstack.\n\n");
+ }
+ }
logger = xtl_createlogger_stdiostream(stderr, minmsglevel, 0);
if (!logger) exit(1);
Index: xen-4.1.1-testing/tools/libxl/xl_cmdimpl.c
===================================================================
--- xen-4.1.1-testing.orig/tools/libxl/xl_cmdimpl.c
+++ xen-4.1.1-testing/tools/libxl/xl_cmdimpl.c
@@ -1725,7 +1725,7 @@ void help(const char *command)
struct cmd_spec *cmd;
if (!command || !strcmp(command, "help")) {
- printf("Usage xl [-v] <subcommand> [args]\n\n");
+ printf("Usage xl [-v] [-f] <subcommand> [args]\n\n");
printf("xl full list of subcommands:\n\n");
for (i = 0; i < cmdtable_len; i++)
printf(" %-20s%s\n",
@@ -1733,7 +1733,7 @@ void help(const char *command)
} else {
cmd = cmdtable_lookup(command);
if (cmd) {
- printf("Usage: xl [-v] %s %s\n\n%s.\n\n",
+ printf("Usage: xl [-v] [-f] %s %s\n\n%s.\n\n",
cmd->cmd_name,
cmd->cmd_usage,
cmd->cmd_desc);

View File

@ -18,7 +18,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
xc = xen.lowlevel.xc.xc()
xoptions = XendOptions.instance()
@@ -3299,33 +3299,38 @@ class XendDomainInfo:
@@ -3298,33 +3298,38 @@ class XendDomainInfo:
# This is a file, not a device. pygrub can cope with a
# file if it's raw, but if it's QCOW or other such formats
# used through blktap, then we need to mount it first.

View File

@ -690,15 +690,16 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -508,7 +508,6 @@ class XendDomainInfo:
@@ -508,8 +508,6 @@ class XendDomainInfo:
self._setSchedParams()
self._storeVmDetails()
self._createChannels()
- self._createDevices()
self._storeDomDetails()
- self._storeDomDetails()
self._endRestore()
except:
@@ -2368,7 +2367,7 @@ class XendDomainInfo:
log.exception('VM resume failed')
@@ -2368,7 +2366,7 @@ class XendDomainInfo:
return self.getDeviceController(deviceClass).reconfigureDevice(
devid, devconfig)
@ -707,7 +708,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
"""Create the devices for a vm.
@raise: VmError for invalid devices
@@ -2417,7 +2416,7 @@ class XendDomainInfo:
@@ -2417,7 +2415,7 @@ class XendDomainInfo:
if self.image:
@ -716,7 +717,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
#if have pass-through devs, need the virtual pci slots info from qemu
self.pci_device_configure_boot()
@@ -3043,7 +3042,7 @@ class XendDomainInfo:
@@ -3043,7 +3041,7 @@ class XendDomainInfo:
self._introduceDomain()
self.image = image.create(self, self.info)
if self.image:

View File

@ -2,7 +2,7 @@ Index: xen-4.1.1-testing/tools/hotplug/Linux/init.d/xencommons
===================================================================
--- xen-4.1.1-testing.orig/tools/hotplug/Linux/init.d/xencommons
+++ xen-4.1.1-testing/tools/hotplug/Linux/init.d/xencommons
@@ -45,6 +45,18 @@ do_start () {
@@ -57,6 +57,18 @@ do_start () {
local time=0
local timeout=30

View File

@ -17,7 +17,7 @@
struct xen_platform_op curop, *op = &curop;
if ( !IS_PRIV(current->domain) )
@@ -513,6 +514,24 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
@@ -522,6 +523,24 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
op->u.mem_add.epfn,
op->u.mem_add.pxm);
break;
@ -44,7 +44,7 @@
break;
--- a/xen/include/public/platform.h
+++ b/xen/include/public/platform.h
@@ -449,6 +449,14 @@ struct xenpf_mem_hotadd
@@ -451,6 +451,14 @@ struct xenpf_mem_hotadd
uint32_t flags;
};
@ -59,7 +59,7 @@
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -469,6 +477,7 @@ struct xen_platform_op {
@@ -471,6 +479,7 @@ struct xen_platform_op {
struct xenpf_cpu_ol cpu_ol;
struct xenpf_cpu_hotadd cpu_add;
struct xenpf_mem_hotadd mem_add;

View File

@ -1,10 +1,8 @@
Change default IO-APIC ack mode for single IO-APIC systems to old-style.
Index: xen-4.1.1-testing/xen/arch/x86/io_apic.c
===================================================================
--- xen-4.1.1-testing.orig/xen/arch/x86/io_apic.c
+++ xen-4.1.1-testing/xen/arch/x86/io_apic.c
@@ -1547,7 +1547,7 @@ static unsigned int startup_level_ioapic
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1578,7 +1578,7 @@ static unsigned int startup_level_ioapic
return 0; /* don't check for pending */
}
@ -13,7 +11,7 @@ Index: xen-4.1.1-testing/xen/arch/x86/io_apic.c
static void setup_ioapic_ack(char *s)
{
if ( !strcmp(s, "old") )
@@ -2044,6 +2044,8 @@ void __init setup_IO_APIC(void)
@@ -2075,6 +2075,8 @@ void __init setup_IO_APIC(void)
else
io_apic_irqs = ~PIC_IRQS;

View File

@ -21,7 +21,7 @@
printk("%p ", _p(*stk++));
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -121,6 +121,8 @@ void __init paging_init(void)
@@ -122,6 +122,8 @@ void __init paging_init(void)
#undef CNT
#undef MFN

View File

@ -241,7 +241,7 @@
status = fread(&buf, 1, sizeof(*h), rtnl);
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -746,7 +746,7 @@ static void __pci_disable_msi(struct msi
@@ -799,7 +799,7 @@ static void __pci_disable_msi(struct msi
{
struct pci_dev *dev;
int pos;

View File

@ -1,3 +1,66 @@
-------------------------------------------------------------------
Tue Aug 23 08:53:20 MDT 2011 - carnold@novell.com
- Upstream patches from Jan
23725-pci-add-device.patch
23762-iommu-fault-bm-off.patch
23763-pci-multi-seg-x2apic-vtd-no-crash.patch
23765-x86-irq-vector-leak.patch
23766-x86-msi-vf-bars.patch
23771-x86-ioapic-clear-pin.patch
23772-x86-trampoline.patch
23774-x86_64-EFI-EDD.patch
23776-x86-kexec-hpet-legacy-bcast-disable.patch
23781-pm-wide-ACPI-ids.patch
23782-x86-ioapic-clear-irr.patch
23783-ACPI-set-_PDC-bits.patch
-------------------------------------------------------------------
Mon Aug 15 11:54:08 CEST 2011 - ohering@suse.de
- Include gcc46 only when it's available (>11.4 && >sles11sp1)
-------------------------------------------------------------------
Fri Aug 12 09:43:23 MDT 2011 - carnold@novell.com
- bnc#711943 - [xl] Fail to create multi-guests with NIC assigned
23685-libxl-segfault-fix.patch
-------------------------------------------------------------------
Thu Aug 11 00:36:17 MDT 2011 - jfehlig@suse.com
- libxenlight and legacy xend toolstack should not be used
together. If xend is running, print a warning and exit
xl. Add a '-f' (force) option to xl to override this
behavior.
disable-xl-when-using-xend.patch
bnc#707664
-------------------------------------------------------------------
Wed Aug 10 09:08:28 MDT 2011 - carnold@novell.com
- Upstream patches from Jan
23732-sedf.patch
23735-guest-dom0-cap.patch
23746-vtd-cleanup-timers.patch
23747-mmcfg-base-address.patch
23749-mmcfg-reservation.patch
-------------------------------------------------------------------
Tue Aug 9 13:38:09 CST 2011 - cyliu@novell.com
- bnc#704160 - crm resource migrate fails with xen machines
update snapshot-xend.patch
- bnc#706574 - xm console DomUName hang after "xm save/restore" of
PVM on the latest Xen
xend-console-port-restore.patch
-------------------------------------------------------------------
Tue Aug 9 16:42:23 CEST 2011 - ohering@suse.de
- update xencommons script to run only when needed
xencommons-proc-xen.patch
-------------------------------------------------------------------
Fri Jul 22 09:34:34 MDT 2011 - carnold@novell.com

107
xen.spec
View File

@ -26,6 +26,13 @@ ExclusiveArch: %ix86 x86_64
%define xen_build_dir xen-4.1.1-testing
%define with_kmp 1
%define with_stubdom 1
# EFI requires gcc46 or newer
# it's available in 12.1 or >= sles11sp2
%if %suse_version > 1140 || %suse_version == 1110
%define with_gcc46 1
%else
%define with_gcc46 0
%endif
%ifarch x86_64
%define with_dom0_support 1
%else
@ -72,9 +79,10 @@ BuildRequires: te_latex
BuildRequires: tetex
%endif
%ifarch x86_64
# EFI requires gcc45 or newer
%if %{?with_gcc46}0
BuildRequires: gcc46
BuildRequires: libgcc46 libgcc46-32bit
%endif
BuildRequires: glibc-32bit glibc-devel-32bit
BuildRequires: gcc-32bit
BuildRequires: gcc43-32bit
@ -88,8 +96,8 @@ BuildRequires: glibc-devel
%if %{?with_kmp}0
BuildRequires: kernel-source kernel-syms module-init-tools xorg-x11
%endif
Version: 4.1.1_02
Release: 3
Version: 4.1.1_06
Release: 1
License: GPLv2+
Group: System/Kernel
AutoReqProv: on
@ -174,10 +182,28 @@ Patch40: 23614-x86_64-EFI-boot.patch
Patch41: 23615-x86_64-EFI-runtime.patch
Patch42: 23616-x86_64-EFI-MPS.patch
Patch43: 23676-x86_64-image-map-bounds.patch
Patch44: 23706-fix-20892.patch
Patch45: 23723-x86-CMOS-lock.patch
Patch46: 23724-x86-smpboot-x2apic.patch
Patch47: 23726-x86-intel-flexmigration.patch
Patch44: 23685-libxl-segfault-fix.patch
Patch45: 23706-fix-20892.patch
Patch46: 23723-x86-CMOS-lock.patch
Patch47: 23724-x86-smpboot-x2apic.patch
Patch48: 23725-pci-add-device.patch
Patch49: 23726-x86-intel-flexmigration.patch
Patch50: 23732-sedf.patch
Patch51: 23735-guest-dom0-cap.patch
Patch52: 23746-vtd-cleanup-timers.patch
Patch53: 23747-mmcfg-base-address.patch
Patch54: 23749-mmcfg-reservation.patch
Patch55: 23762-iommu-fault-bm-off.patch
Patch56: 23763-pci-multi-seg-x2apic-vtd-no-crash.patch
Patch57: 23765-x86-irq-vector-leak.patch
Patch58: 23766-x86-msi-vf-bars.patch
Patch59: 23771-x86-ioapic-clear-pin.patch
Patch60: 23772-x86-trampoline.patch
Patch61: 23774-x86_64-EFI-EDD.patch
Patch62: 23776-x86-kexec-hpet-legacy-bcast-disable.patch
Patch63: 23781-pm-wide-ACPI-ids.patch
Patch64: 23782-x86-ioapic-clear-irr.patch
Patch65: 23783-ACPI-set-_PDC-bits.patch
# Upstream qemu patches
# Our patches
Patch300: xen-config.diff
@ -223,16 +249,17 @@ Patch356: ioemu-vnc-resize.patch
Patch357: ioemu-debuginfo.patch
Patch358: vif-bridge-no-iptables.patch
Patch359: xenconsole-no-multiple-connections.patch
Patch360: disable-xl-when-using-xend.patch
# Needs to go upstream
Patch360: checkpoint-rename.patch
Patch361: xm-save-check-file.patch
Patch362: xm-create-xflag.patch
Patch370: xend-sysconfig.patch
Patch371: domu-usb-controller.patch
Patch372: usb-list.patch
Patch373: xend-devid-or-name.patch
Patch374: suspend_evtchn_lock.patch
Patch375: log-guest-console.patch
Patch370: checkpoint-rename.patch
Patch371: xm-save-check-file.patch
Patch372: xm-create-xflag.patch
Patch373: xend-sysconfig.patch
Patch374: domu-usb-controller.patch
Patch375: usb-list.patch
Patch376: xend-devid-or-name.patch
Patch377: suspend_evtchn_lock.patch
Patch378: log-guest-console.patch
# Patches for snapshot support
Patch400: snapshot-ioemu-save.patch
Patch401: snapshot-ioemu-restore.patch
@ -276,6 +303,8 @@ Patch450: ioemu-watchdog-support.patch
Patch451: ioemu-watchdog-linkage.patch
Patch452: ioemu-watchdog-ib700-timer.patch
Patch453: tools-watchdog-support.patch
Patch454: xend-console-port-restore.patch
Patch455: xencommons-proc-xen.patch
# Jim's domain lock patch
Patch480: xend-domain-lock.patch
Patch481: xend-domain-lock-sfex.patch
@ -710,6 +739,24 @@ tar xfj %{SOURCE2} -C $RPM_BUILD_DIR/%{xen_build_dir}/tools
%patch45 -p1
%patch46 -p1
%patch47 -p1
%patch48 -p1
%patch49 -p1
%patch50 -p1
%patch51 -p1
%patch52 -p1
%patch53 -p1
%patch54 -p1
%patch55 -p1
%patch56 -p1
%patch57 -p1
%patch58 -p1
%patch59 -p1
%patch60 -p1
%patch61 -p1
%patch62 -p1
%patch63 -p1
%patch64 -p1
%patch65 -p1
%patch300 -p1
%patch301 -p1
%patch302 -p1
@ -754,14 +801,15 @@ tar xfj %{SOURCE2} -C $RPM_BUILD_DIR/%{xen_build_dir}/tools
%patch358 -p1
%patch359 -p1
%patch360 -p1
%patch361 -p1
%patch362 -p1
%patch370 -p1
%patch371 -p1
%patch372 -p1
%patch373 -p1
#%patch374 -p1 suspend_evtchn_lock, buildservice build problem
%patch374 -p1
%patch375 -p1
%patch376 -p1
#%patch377 -p1 suspend_evtchn_lock, buildservice build problem
%patch378 -p1
%patch400 -p1
%patch401 -p1
%patch402 -p1
@ -802,6 +850,8 @@ tar xfj %{SOURCE2} -C $RPM_BUILD_DIR/%{xen_build_dir}/tools
%patch451 -p1
%patch452 -p1
%patch453 -p1
%patch454 -p1
%patch455 -p1
%patch480 -p1
%patch481 -p1
%patch500 -p1
@ -888,7 +938,11 @@ export CFLAGS="$RPM_OPT_FLAGS"
%if %{?with_dom0_support}0
# EFI
%ifarch x86_64
make -C xen install CC=gcc-4.6 max_phys_cpus=%{max_cpus} debug=n crash_debug=n DESTDIR=$RPM_BUILD_ROOT %{?_smp_mflags}
make -C xen install \
%if %{?with_gcc46}0
CC=gcc-4.6 \
%endif
max_phys_cpus=%{max_cpus} debug=n crash_debug=n DESTDIR=$RPM_BUILD_ROOT %{?_smp_mflags}
make -C xen clean
%endif
install_xen()
@ -928,6 +982,12 @@ make -C tools/misc/serial-split install \
# Create the libdir xen/bin directory and link qemu-dm into it.
# The path must use the RPM _libdir macro: a shell-style variable of that name
# is not set by rpm and would expand to an empty string, creating /xen/bin.
mkdir -p $RPM_BUILD_ROOT/%{_libdir}/xen/bin/
ln -s /usr/lib/xen/bin/qemu-dm $RPM_BUILD_ROOT/%{_libdir}/xen/bin/qemu-dm
%endif
# EFI output depends on gcc 4.6; list the efi directory in xen.files.txt only if the build created it
echo > xen.files.txt
if test -d $RPM_BUILD_ROOT%{_libdir}/efi
then
echo %{_libdir}/efi >> xen.files.txt
fi
cp -avL xenalyze.hg/dump-raw $RPM_BUILD_ROOT/%{_bindir}/xenalyze.dump-raw
cp -avL xenalyze.hg/xenalyze $RPM_BUILD_ROOT/%{_bindir}
%else
@ -1072,7 +1132,7 @@ rm -f $RPM_BUILD_ROOT/%{_bindir}/xencons
%if %{?with_dom0_support}0
%files
%files -f xen.files.txt
%defattr(-,root,root)
/boot/xen-%{version}-%{release}.gz
/boot/xen-%{xvermaj}.gz
@ -1086,11 +1146,6 @@ rm -f $RPM_BUILD_ROOT/%{_bindir}/xencons
/boot/xen-syms-dbg
/boot/xen-syms-dbg-%{version}-%{release}
/boot/xen.gz
# EFI
%ifarch x86_64
%dir %{_libdir}/efi
%{_libdir}/efi/xen*.efi
%endif
%endif
%files libs

55
xencommons-proc-xen.patch Normal file
View File

@ -0,0 +1,55 @@
# HG changeset patch
# Parent ea18090ab6e3cb3c69d232ec0865589688db3f81
hotplug: update xencommons script to run only when needed
Update the xencommons script to run only when needed:
- do not run if /proc/xen does not exist
- check if /proc/xen/capabilities exists before doing the grep for dom0
- use variable for /proc/xen/capabilities
- use grep -q instead of stdout redirection when looking for xenfs,
it's already used later
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/hotplug/Linux/init.d/xencommons | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
Index: xen-4.1.1-testing/tools/hotplug/Linux/init.d/xencommons
===================================================================
--- xen-4.1.1-testing.orig/tools/hotplug/Linux/init.d/xencommons
+++ xen-4.1.1-testing/tools/hotplug/Linux/init.d/xencommons
@@ -27,17 +27,29 @@ fi
test -f $xencommons_config/xencommons && . $xencommons_config/xencommons
XENCONSOLED_PIDFILE=/var/run/xenconsoled.pid
+XEN_CAPABILITIES=/proc/xen/capabilities
shopt -s extglob
+# not running in Xen dom0 or domU
+if ! test -d /proc/xen ; then
+ exit 0
+fi
+
+# mount xenfs in dom0 or domU with a pv_ops kernel
if test "x$1" = xstart && \
- test -d /proc/xen && \
- ! test -f /proc/xen/capabilities && \
- ! grep '^xenfs ' /proc/mounts >/dev/null;
+ ! test -f $XEN_CAPABILITIES && \
+ ! grep -q '^xenfs ' /proc/mounts ;
then
mount -t xenfs xenfs /proc/xen
fi
-if ! grep -q "control_d" /proc/xen/capabilities ; then
+# run this script only in dom0:
+# no capabilities file in xenlinux kernel
+if ! test -f $XEN_CAPABILITIES ; then
+ exit 0
+fi
+# empty capabilities file in pv_ops kernel
+if ! grep -q "control_d" $XEN_CAPABILITIES ; then
exit 0
fi

View File

@ -0,0 +1,40 @@
Pass console_port to completeRestore() so that console/port is written to
xenstore. See bnc#706574
From: Chunyan Liu <cyliu@novell.com>
Index: xen-4.1.1-testing/tools/python/xen/xend/XendCheckpoint.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendCheckpoint.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendCheckpoint.py
@@ -402,8 +402,7 @@ def restore(xd, fd, dominfo = None, paus
restore_image.setCpuid()
# xc_restore will wait for source to close connection
-
- dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
+ dominfo.completeRestore(handler.store_mfn, handler.console_mfn, console_port)
wait_devs(dominfo)
Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -3052,7 +3052,7 @@ class XendDomainInfo:
# TODO: recategorise - called from XendCheckpoint
#
- def completeRestore(self, store_mfn, console_mfn):
+ def completeRestore(self, store_mfn, console_mfn, console_port):
log.debug("XendDomainInfo.completeRestore")
@@ -3063,6 +3063,7 @@ class XendDomainInfo:
self.image = image.create(self, self.info)
if self.image:
self._createDevices(True)
+ self.console_port = console_port
self._storeDomDetails()
self._registerWatches()
self.refreshShutdown()

View File

@ -237,7 +237,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendCheckpoint.py
dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
domain_name)
log.info("Domain %d suspended.", dominfo.getDomid())
@@ -410,6 +412,7 @@ def restore(xd, fd, dominfo = None, paus
@@ -409,6 +411,7 @@ def restore(xd, fd, dominfo = None, paus
if not paused:
dominfo.unpause()
@ -257,7 +257,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
XendTask.log_progress(0, 30, self._constructDomain)
XendTask.log_progress(31, 60, self._initDomain)
@@ -2998,6 +2999,11 @@ class XendDomainInfo:
@@ -2997,6 +2998,11 @@ class XendDomainInfo:
self._stateSet(DOM_STATE_HALTED)
self.domid = None # Do not push into _stateSet()!

View File

@ -123,7 +123,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
===================================================================
--- xen-4.1.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
+++ xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
@@ -2291,6 +2291,8 @@ class XendDomainInfo:
@@ -2290,6 +2290,8 @@ class XendDomainInfo:
self.info['name_label'], self.domid, self.info['uuid'],
new_name, new_uuid)
self._unwatchVm()
@ -132,7 +132,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
self._releaseDevices()
# Remove existing vm node in xenstore
self._removeVm()
@@ -2962,6 +2964,9 @@ class XendDomainInfo:
@@ -2961,6 +2963,9 @@ class XendDomainInfo:
self._createDevices()
@ -142,7 +142,7 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
self.image.cleanupTmpImages()
self.info['start_time'] = time.time()
@@ -2986,6 +2991,8 @@ class XendDomainInfo:
@@ -2985,6 +2990,8 @@ class XendDomainInfo:
self.refresh_shutdown_lock.acquire()
try:
self.unwatchShutdown()
@ -151,14 +151,14 @@ Index: xen-4.1.1-testing/tools/python/xen/xend/XendDomainInfo.py
self._releaseDevices()
bootloader_tidy(self)
@@ -3070,6 +3077,7 @@ class XendDomainInfo:
@@ -3069,6 +3076,7 @@ class XendDomainInfo:
self.image = image.create(self, self.info)
if self.image:
self._createDevices(True)
+ self.image.createXenPaging()
self.console_port = console_port
self._storeDomDetails()
self._registerWatches()
self.refreshShutdown()
@@ -3210,6 +3218,8 @@ class XendDomainInfo:
# could also fetch a parsed note from xenstore
fast = self.info.get_notes().get('SUSPEND_CANCEL') and 1 or 0