327 lines
11 KiB
Diff
327 lines
11 KiB
Diff
|
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||
|
|
||
|
x86/MSI-X: be more careful during teardown
|
||
|
|
||
|
When a device gets detached from a guest, pciback will clear its
|
||
|
command register, thus disabling both memory and I/O decoding. The
|
||
|
disabled memory decoding, however, has an effect on the MSI-X table
|
||
|
accesses the hypervisor does: These won't have the intended effect
|
||
|
anymore. Even worse, for PCIe devices (but not SR-IOV virtual
|
||
|
functions) such accesses may (will?) be treated as Unsupported
|
||
|
Requests, causing respective errors to be surfaced, potentially in the
|
||
|
form of NMIs that may be fatal to the hypervisor or Dom0 is different
|
||
|
ways. Hence rather than carrying out these accesses, we should avoid
|
||
|
them where we can, and use alternative (e.g. PCI config space based)
|
||
|
mechanisms to achieve at least the same effect.
|
||
|
|
||
|
At this time it continues to be unclear whether this is fixing an
|
||
|
actual bug or is rather just working around bogus (but apparently
|
||
|
common) system behavior.
|
||
|
|
||
|
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||
|
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||
|
---
|
||
|
The use of the mask-all bit here collides with qemu's incorreect use
|
||
|
of that same bit. This would become a security issue if released that
|
||
|
way. A later patch in this series will provide the infrastructure for
|
||
|
qemu to stop direct access to that bit. A qemu series including a patch
|
||
|
making use of the new interface will be sent subsequently.
|
||
|
|
||
|
Backporting note (largely to myself):
|
||
|
Depends on (not yet backported to 4.4 and earlier) commit 061eebe0e
|
||
|
"x86/MSI: drop workaround for insecure Dom0 kernels" (due to re-use
|
||
|
of struct arch_msix's warned field).
|
||
|
|
||
|
--- trunk.orig/xen/arch/x86/irq.c 2015-06-03 16:55:05.000000000 +0200
|
||
|
+++ trunk/xen/arch/x86/irq.c 2015-03-25 09:36:52.000000000 +0100
|
||
|
@@ -217,9 +217,9 @@ void destroy_irq(unsigned int irq)
|
||
|
}
|
||
|
|
||
|
spin_lock_irqsave(&desc->lock, flags);
|
||
|
- desc->status |= IRQ_DISABLED;
|
||
|
desc->status &= ~IRQ_GUEST;
|
||
|
desc->handler->shutdown(desc);
|
||
|
+ desc->status |= IRQ_DISABLED;
|
||
|
action = desc->action;
|
||
|
desc->action = NULL;
|
||
|
desc->msi_desc = NULL;
|
||
|
@@ -995,8 +995,8 @@ void __init release_irq(unsigned int irq
|
||
|
spin_lock_irqsave(&desc->lock,flags);
|
||
|
action = desc->action;
|
||
|
desc->action = NULL;
|
||
|
- desc->status |= IRQ_DISABLED;
|
||
|
desc->handler->shutdown(desc);
|
||
|
+ desc->status |= IRQ_DISABLED;
|
||
|
spin_unlock_irqrestore(&desc->lock,flags);
|
||
|
|
||
|
/* Wait to make sure it's not being used on another CPU */
|
||
|
@@ -1732,8 +1732,8 @@ static irq_guest_action_t *__pirq_guest_
|
||
|
BUG_ON(action->in_flight != 0);
|
||
|
|
||
|
/* Disabling IRQ before releasing the desc_lock avoids an IRQ storm. */
|
||
|
- desc->status |= IRQ_DISABLED;
|
||
|
desc->handler->disable(desc);
|
||
|
+ desc->status |= IRQ_DISABLED;
|
||
|
|
||
|
/*
|
||
|
* Mark any remaining pending EOIs as ready to flush.
|
||
|
--- trunk.orig/xen/arch/x86/msi.c 2015-05-19 23:16:48.000000000 +0200
|
||
|
+++ trunk/xen/arch/x86/msi.c 2015-03-25 09:35:38.000000000 +0100
|
||
|
@@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
|
||
|
spin_unlock(&msix->table_lock);
|
||
|
}
|
||
|
|
||
|
+static bool_t memory_decoded(const struct pci_dev *dev)
|
||
|
+{
|
||
|
+ u8 bus, slot, func;
|
||
|
+
|
||
|
+ if ( !dev->info.is_virtfn )
|
||
|
+ {
|
||
|
+ bus = dev->bus;
|
||
|
+ slot = PCI_SLOT(dev->devfn);
|
||
|
+ func = PCI_FUNC(dev->devfn);
|
||
|
+ }
|
||
|
+ else
|
||
|
+ {
|
||
|
+ bus = dev->info.physfn.bus;
|
||
|
+ slot = PCI_SLOT(dev->info.physfn.devfn);
|
||
|
+ func = PCI_FUNC(dev->info.physfn.devfn);
|
||
|
+ }
|
||
|
+
|
||
|
+ return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
|
||
|
+ PCI_COMMAND_MEMORY);
|
||
|
+}
|
||
|
+
|
||
|
/*
|
||
|
* MSI message composition
|
||
|
*/
|
||
|
@@ -162,7 +183,7 @@ void msi_compose_msg(unsigned vector, co
|
||
|
}
|
||
|
}
|
||
|
|
||
|
-static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||
|
+static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||
|
{
|
||
|
switch ( entry->msi_attrib.type )
|
||
|
{
|
||
|
@@ -198,6 +219,8 @@ static void read_msi_msg(struct msi_desc
|
||
|
void __iomem *base;
|
||
|
base = entry->mask_base;
|
||
|
|
||
|
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||
|
+ return 0;
|
||
|
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||
|
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
|
||
|
msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
|
||
|
@@ -209,6 +232,8 @@ static void read_msi_msg(struct msi_desc
|
||
|
|
||
|
if ( iommu_intremap )
|
||
|
iommu_read_msi_from_ire(entry, msg);
|
||
|
+
|
||
|
+ return 1;
|
||
|
}
|
||
|
|
||
|
static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||
|
@@ -260,6 +285,8 @@ static int write_msi_msg(struct msi_desc
|
||
|
void __iomem *base;
|
||
|
base = entry->mask_base;
|
||
|
|
||
|
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||
|
+ return -ENXIO;
|
||
|
writel(msg->address_lo,
|
||
|
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||
|
writel(msg->address_hi,
|
||
|
@@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d
|
||
|
ASSERT(spin_is_locked(&desc->lock));
|
||
|
|
||
|
memset(&msg, 0, sizeof(msg));
|
||
|
- read_msi_msg(msi_desc, &msg);
|
||
|
+ if ( !read_msi_msg(msi_desc, &msg) )
|
||
|
+ return;
|
||
|
|
||
|
msg.data &= ~MSI_DATA_VECTOR_MASK;
|
||
|
msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
|
||
|
@@ -347,20 +375,24 @@ int msi_maskable_irq(const struct msi_de
|
||
|
|| entry->msi_attrib.maskbit;
|
||
|
}
|
||
|
|
||
|
-static void msi_set_mask_bit(struct irq_desc *desc, int flag)
|
||
|
+static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag)
|
||
|
{
|
||
|
struct msi_desc *entry = desc->msi_desc;
|
||
|
+ struct pci_dev *pdev;
|
||
|
+ u16 seg;
|
||
|
+ u8 bus, slot, func;
|
||
|
|
||
|
ASSERT(spin_is_locked(&desc->lock));
|
||
|
BUG_ON(!entry || !entry->dev);
|
||
|
+ pdev = entry->dev;
|
||
|
+ seg = pdev->seg;
|
||
|
+ bus = pdev->bus;
|
||
|
+ slot = PCI_SLOT(pdev->devfn);
|
||
|
+ func = PCI_FUNC(pdev->devfn);
|
||
|
switch (entry->msi_attrib.type) {
|
||
|
case PCI_CAP_ID_MSI:
|
||
|
if (entry->msi_attrib.maskbit) {
|
||
|
u32 mask_bits;
|
||
|
- u16 seg = entry->dev->seg;
|
||
|
- u8 bus = entry->dev->bus;
|
||
|
- u8 slot = PCI_SLOT(entry->dev->devfn);
|
||
|
- u8 func = PCI_FUNC(entry->dev->devfn);
|
||
|
|
||
|
mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
|
||
|
mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
|
||
|
@@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_
|
||
|
}
|
||
|
break;
|
||
|
case PCI_CAP_ID_MSIX:
|
||
|
- {
|
||
|
- int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
|
||
|
- writel(flag, entry->mask_base + offset);
|
||
|
- readl(entry->mask_base + offset);
|
||
|
- break;
|
||
|
- }
|
||
|
+ if ( likely(memory_decoded(pdev)) )
|
||
|
+ {
|
||
|
+ writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||
|
+ readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ if ( flag )
|
||
|
+ {
|
||
|
+ u16 control;
|
||
|
+ domid_t domid = pdev->domain->domain_id;
|
||
|
+
|
||
|
+ control = pci_conf_read16(seg, bus, slot, func,
|
||
|
+ msix_control_reg(entry->msi_attrib.pos));
|
||
|
+ if ( control & PCI_MSIX_FLAGS_MASKALL )
|
||
|
+ break;
|
||
|
+ pci_conf_write16(seg, bus, slot, func,
|
||
|
+ msix_control_reg(entry->msi_attrib.pos),
|
||
|
+ control | PCI_MSIX_FLAGS_MASKALL);
|
||
|
+ if ( pdev->msix->warned != domid )
|
||
|
+ {
|
||
|
+ pdev->msix->warned = domid;
|
||
|
+ printk(XENLOG_G_WARNING
|
||
|
+ "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
|
||
|
+ desc->irq, domid, pdev->seg, pdev->bus,
|
||
|
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
|
||
|
+ }
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ /* fall through */
|
||
|
default:
|
||
|
- BUG();
|
||
|
- break;
|
||
|
+ return 0;
|
||
|
}
|
||
|
entry->msi_attrib.masked = !!flag;
|
||
|
+
|
||
|
+ return 1;
|
||
|
}
|
||
|
|
||
|
static int msi_get_mask_bit(const struct msi_desc *entry)
|
||
|
{
|
||
|
- switch (entry->msi_attrib.type) {
|
||
|
+ if ( !entry->dev )
|
||
|
+ return -1;
|
||
|
+
|
||
|
+ switch ( entry->msi_attrib.type )
|
||
|
+ {
|
||
|
case PCI_CAP_ID_MSI:
|
||
|
- if (!entry->dev || !entry->msi_attrib.maskbit)
|
||
|
+ if ( !entry->msi_attrib.maskbit )
|
||
|
break;
|
||
|
return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
|
||
|
PCI_SLOT(entry->dev->devfn),
|
||
|
@@ -394,6 +454,8 @@ static int msi_get_mask_bit(const struct
|
||
|
entry->msi.mpos) >>
|
||
|
entry->msi_attrib.entry_nr) & 1;
|
||
|
case PCI_CAP_ID_MSIX:
|
||
|
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||
|
+ break;
|
||
|
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
|
||
|
}
|
||
|
return -1;
|
||
|
@@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct
|
||
|
|
||
|
void mask_msi_irq(struct irq_desc *desc)
|
||
|
{
|
||
|
- msi_set_mask_bit(desc, 1);
|
||
|
+ if ( unlikely(!msi_set_mask_bit(desc, 1)) )
|
||
|
+ BUG_ON(!(desc->status & IRQ_DISABLED));
|
||
|
}
|
||
|
|
||
|
void unmask_msi_irq(struct irq_desc *desc)
|
||
|
{
|
||
|
- msi_set_mask_bit(desc, 0);
|
||
|
+ if ( unlikely(!msi_set_mask_bit(desc, 0)) )
|
||
|
+ WARN();
|
||
|
}
|
||
|
|
||
|
static unsigned int startup_msi_irq(struct irq_desc *desc)
|
||
|
@@ -723,6 +787,9 @@ static int msix_capability_init(struct p
|
||
|
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||
|
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
|
||
|
|
||
|
+ if ( unlikely(!memory_decoded(dev)) )
|
||
|
+ return -ENXIO;
|
||
|
+
|
||
|
if ( desc )
|
||
|
{
|
||
|
entry = alloc_msi_entry(1);
|
||
|
@@ -855,7 +922,8 @@ static int msix_capability_init(struct p
|
||
|
++msix->used_entries;
|
||
|
|
||
|
/* Restore MSI-X enabled bits */
|
||
|
- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||
|
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||
|
+ control & ~PCI_MSIX_FLAGS_MASKALL);
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
@@ -1008,8 +1076,16 @@ static void __pci_disable_msix(struct ms
|
||
|
|
||
|
BUG_ON(list_empty(&dev->msi_list));
|
||
|
|
||
|
- writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||
|
-
|
||
|
+ if ( likely(memory_decoded(dev)) )
|
||
|
+ writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||
|
+ else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
|
||
|
+ {
|
||
|
+ printk(XENLOG_WARNING
|
||
|
+ "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
|
||
|
+ entry->irq, dev->seg, dev->bus,
|
||
|
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
|
||
|
+ control |= PCI_MSIX_FLAGS_MASKALL;
|
||
|
+ }
|
||
|
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||
|
|
||
|
_pci_cleanup_msix(dev->msix);
|
||
|
@@ -1147,14 +1223,23 @@ int pci_restore_msi_state(struct pci_dev
|
||
|
nr = entry->msi.nvec;
|
||
|
}
|
||
|
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
|
||
|
+ {
|
||
|
msix_set_enable(pdev, 0);
|
||
|
+ if ( unlikely(!memory_decoded(pdev)) )
|
||
|
+ {
|
||
|
+ spin_unlock_irqrestore(&desc->lock, flags);
|
||
|
+ return -ENXIO;
|
||
|
+ }
|
||
|
+ }
|
||
|
|
||
|
msg = entry->msg;
|
||
|
write_msi_msg(entry, &msg);
|
||
|
|
||
|
for ( i = 0; ; )
|
||
|
{
|
||
|
- msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
|
||
|
+ if ( unlikely(!msi_set_mask_bit(desc,
|
||
|
+ entry[i].msi_attrib.masked)) )
|
||
|
+ BUG();
|
||
|
|
||
|
if ( !--nr )
|
||
|
break;
|