- Upstream patches from Jan
  5281fad4-numa-sched-leave-node-affinity-alone-if-not-in-auto-mode.patch
  52820823-nested-SVM-adjust-guest-handling-of-structure-mappings.patch
  52820863-VMX-don-t-crash-processing-d-debug-key.patch
  5282492f-x86-eliminate-has_arch_mmios.patch
  52864df2-credit-Update-other-parameters-when-setting-tslice_ms.patch
  52864f30-fix-leaking-of-v-cpu_affinity_saved-on-domain-destruction.patch
  5289d225-nested-VMX-don-t-ignore-mapping-errors.patch
  528a0eb0-x86-consider-modules-when-cutting-off-memory.patch
  528f606c-x86-hvm-reset-TSC-to-0-after-domain-resume-from-S3.patch
  528f609c-x86-crash-disable-the-watchdog-NMIs-on-the-crashing-cpu.patch
  52932418-x86-xsave-fix-nonlazy-state-handling.patch
- Add missing requires to pciutils package for xend-tools
- bnc#851749 - Xen service file does not call xend properly
  xend.service
- bnc#851386 - VUL-0: xen: XSA-78: Insufficient TLB flushing in VT-d (iommu) code
  528a0e5b-TLB-flushing-in-dma_pte_clear_one.patch
- bnc#849667 - VUL-0: xen: XSA-74: Lock order reversal between page_alloc_lock and mm_rwlock
  CVE-2013-4553-xsa74.patch
- bnc#849665 - VUL-0: CVE-2013-4551: xen: XSA-75: Host crash due to guest VMX instruction execution
  52809208-nested-VMX-VMLANUCH-VMRESUME-emulation-must-check-permission-1st.patch
- bnc#849668 - VUL-0: xen: XSA-76: Hypercalls exposed to privilege rings 1 and 2 of HVM guests

OBS-URL: https://build.opensuse.org/package/show/Virtualization/xen?expand=0&rev=279
This commit is contained in:
parent 731bb529f3
commit a11c33863f
@@ -1,44 +0,0 @@
# Commit 5ad914bc867c5a6a4957869c89918f4e1f9dd9c4
# Date 2013-07-02 08:48:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: don't pass negative time to gtime_to_gtsc() (try 2)

This mostly reverts commit eb60be3d ("x86: don't pass negative time to
gtime_to_gtsc()") and instead corrects __update_vcpu_system_time()'s
handling of this_cpu(cpu_time).stime_local_stamp dating back before the
start of a HVM guest (which would otherwise lead to a negative value
getting passed to gtime_to_gtsc(), causing scale_delta() to produce
meaningless output).

Flushing the value to zero was wrong, and printing a message for
something that can validly happen wasn't very useful either.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -823,16 +823,13 @@ static void __update_vcpu_system_time(st
             struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time;
 
             stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset;
-            if ( (s64)stime < 0 )
-            {
-                printk(XENLOG_G_WARNING "d%dv%d: bogus time %" PRId64
-                       " (offsets %" PRId64 "/%" PRId64 ")\n",
-                       d->domain_id, v->vcpu_id, stime,
-                       pl->stime_offset, v->arch.hvm_vcpu.stime_offset);
-                stime = 0;
-            }
+            if ( stime >= 0 )
+                tsc_stamp = gtime_to_gtsc(d, stime);
+            else
+                tsc_stamp = -gtime_to_gtsc(d, -stime);
         }
-        tsc_stamp = gtime_to_gtsc(d, stime);
+        else
+            tsc_stamp = gtime_to_gtsc(d, stime);
     }
     else
     {
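The sign handling introduced above follows a simple idiom: negate before and after an unsigned-only conversion. A minimal standalone sketch, where scale_u64() is a hypothetical stand-in for the gtime_to_gtsc()/scale_delta() path (which is only correct for non-negative deltas):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for an unsigned-only scaling helper. */
    static uint64_t scale_u64(uint64_t delta)
    {
        return delta * 3 / 2;               /* placeholder scale factor */
    }

    /* Mirror of "tsc_stamp = -gtime_to_gtsc(d, -stime)": negate before
     * and after the unsigned conversion so negative deltas scale right. */
    static int64_t scale_s64(int64_t delta)
    {
        return delta >= 0 ? (int64_t)scale_u64(delta)
                          : -(int64_t)scale_u64(-(uint64_t)delta);
    }

    int main(void)
    {
        printf("%lld\n", (long long)scale_s64(-100));   /* prints -150 */
        return 0;
    }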
@@ -1,285 +0,0 @@
# Commit 2823a0c7dfc979db316787e1dd42a8845e5825c0
# Date 2013-07-02 08:49:43 +0200
# Author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
iommu/amd: Fix logic for clearing the IOMMU interrupt bits

The IOMMU interrupt bits in the IOMMU status registers are
"read-only, write-1-to-clear" (RW1C). Therefore, the existing
logic, which reads the register, sets the bit, and then writes the
value back, could accidentally clear other bits that happen to be set.

The correct logic is to write a value that sets only the
interrupt bits, leaving the rest zero.

This patch also cleans up the #define masks, as Jan suggested.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

With iommu_interrupt_handler() properly having got switched its readl()
from status to control register, the subsequent writel() needed to be
switched too (and the RW1C comment there was bogus).

Some of the cleanup went too far - undone.

Further, with iommu_interrupt_handler() now actually disabling the
interrupt sources, they also need to get re-enabled by the tasklet once
it finished processing the respective log. This also implies re-running
the tasklet so that log entries added between reading the log and re-
enabling the interrupt will get handled in a timely manner.

Finally, guest write emulation to the status register needs to be done
with the RW1C (and RO for all other bits) semantics in mind too.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -75,11 +75,9 @@ static void flush_command_buffer(struct
     u32 cmd[4], status;
     int loop_count, comp_wait;
 
-    /* clear 'ComWaitInt' in status register (WIC) */
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
-                         IOMMU_STATUS_COMP_WAIT_INT_MASK,
-                         IOMMU_STATUS_COMP_WAIT_INT_SHIFT, &status);
-    writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    /* RW1C 'ComWaitInt' in status register */
+    writel(IOMMU_STATUS_COMP_WAIT_INT_MASK,
+           iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
 
     /* send an empty COMPLETION_WAIT command to flush command buffer */
     cmd[3] = cmd[2] = 0;
@@ -103,9 +101,9 @@ static void flush_command_buffer(struct
 
     if ( comp_wait )
     {
-        /* clear 'ComWaitInt' in status register (WIC) */
-        status &= IOMMU_STATUS_COMP_WAIT_INT_MASK;
-        writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+        /* RW1C 'ComWaitInt' in status register */
+        writel(IOMMU_STATUS_COMP_WAIT_INT_MASK,
+               iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
         return;
     }
     AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n");
--- a/xen/drivers/passthrough/amd/iommu_guest.c
+++ b/xen/drivers/passthrough/amd/iommu_guest.c
@@ -754,7 +754,14 @@ static void guest_iommu_mmio_write64(str
         u64_to_reg(&iommu->ppr_log.reg_tail, val);
         break;
     case IOMMU_STATUS_MMIO_OFFSET:
-        u64_to_reg(&iommu->reg_status, val);
+        val &= IOMMU_STATUS_EVENT_OVERFLOW_MASK |
+               IOMMU_STATUS_EVENT_LOG_INT_MASK |
+               IOMMU_STATUS_COMP_WAIT_INT_MASK |
+               IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK |
+               IOMMU_STATUS_PPR_LOG_INT_MASK |
+               IOMMU_STATUS_GAPIC_LOG_OVERFLOW_MASK |
+               IOMMU_STATUS_GAPIC_LOG_INT_MASK;
+        u64_to_reg(&iommu->reg_status, reg_to_u64(iommu->reg_status) & ~val);
         break;
 
     default:
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -344,13 +344,13 @@ static void set_iommu_ppr_log_control(st
         writeq(0, iommu->mmio_base + IOMMU_PPR_LOG_TAIL_OFFSET);
 
         iommu_set_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT);
-        iommu_set_bit(&entry, IOMMU_CONTROL_PPR_INT_SHIFT);
+        iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
         iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT);
     }
     else
     {
         iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT);
-        iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_INT_SHIFT);
+        iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
         iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT);
     }
 
@@ -410,7 +410,7 @@ static void iommu_reset_log(struct amd_i
                             void (*ctrl_func)(struct amd_iommu *iommu, int))
 {
     u32 entry;
-    int log_run, run_bit, of_bit;
+    int log_run, run_bit;
     int loop_count = 1000;
 
     BUG_ON(!iommu || ((log != &iommu->event_log) && (log != &iommu->ppr_log)));
@@ -419,10 +419,6 @@ static void iommu_reset_log(struct amd_i
               IOMMU_STATUS_EVENT_LOG_RUN_SHIFT :
               IOMMU_STATUS_PPR_LOG_RUN_SHIFT;
 
-    of_bit = ( log == &iommu->event_log ) ?
-              IOMMU_STATUS_EVENT_OVERFLOW_SHIFT :
-              IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT;
-
     /* wait until EventLogRun bit = 0 */
     do {
         entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
@@ -439,9 +435,10 @@ static void iommu_reset_log(struct amd_i
 
     ctrl_func(iommu, IOMMU_CONTROL_DISABLED);
 
-    /*clear overflow bit */
-    iommu_clear_bit(&entry, of_bit);
-    writel(entry, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    /* RW1C overflow bit */
+    writel(log == &iommu->event_log ? IOMMU_STATUS_EVENT_OVERFLOW_MASK
+                                    : IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK,
+           iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
 
     /*reset event log base address */
     log->head = 0;
@@ -611,22 +608,33 @@ static void iommu_check_event_log(struct
     u32 entry;
     unsigned long flags;
 
+    /* RW1C interrupt status bit */
+    writel(IOMMU_STATUS_EVENT_LOG_INT_MASK,
+           iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+
     iommu_read_log(iommu, &iommu->event_log,
                    sizeof(event_entry_t), parse_event_log_entry);
 
     spin_lock_irqsave(&iommu->lock, flags);
 
-    /*check event overflow */
+    /* Check event overflow. */
     entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
-
     if ( iommu_get_bit(entry, IOMMU_STATUS_EVENT_OVERFLOW_SHIFT) )
         iommu_reset_log(iommu, &iommu->event_log, set_iommu_event_log_control);
-
-    /* reset interrupt status bit */
-    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
-    iommu_set_bit(&entry, IOMMU_STATUS_EVENT_LOG_INT_SHIFT);
-
-    writel(entry, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    else
+    {
+        entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+        if ( !(entry & IOMMU_CONTROL_EVENT_LOG_INT_MASK) )
+        {
+            entry |= IOMMU_CONTROL_EVENT_LOG_INT_MASK;
+            writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+            /*
+             * Re-schedule the tasklet to handle eventual log entries added
+             * between reading the log above and re-enabling the interrupt.
+             */
+            tasklet_schedule(&amd_iommu_irq_tasklet);
+        }
+    }
 
     spin_unlock_irqrestore(&iommu->lock, flags);
 }
@@ -681,22 +689,33 @@ static void iommu_check_ppr_log(struct a
     u32 entry;
     unsigned long flags;
 
+    /* RW1C interrupt status bit */
+    writel(IOMMU_STATUS_PPR_LOG_INT_MASK,
+           iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+
     iommu_read_log(iommu, &iommu->ppr_log,
                    sizeof(ppr_entry_t), parse_ppr_log_entry);
 
     spin_lock_irqsave(&iommu->lock, flags);
 
-    /*check event overflow */
+    /* Check event overflow. */
     entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
-
     if ( iommu_get_bit(entry, IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT) )
         iommu_reset_log(iommu, &iommu->ppr_log, set_iommu_ppr_log_control);
-
-    /* reset interrupt status bit */
-    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
-    iommu_set_bit(&entry, IOMMU_STATUS_PPR_LOG_INT_SHIFT);
-
-    writel(entry, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    else
+    {
+        entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+        if ( !(entry & IOMMU_CONTROL_PPR_LOG_INT_MASK) )
+        {
+            entry |= IOMMU_CONTROL_PPR_LOG_INT_MASK;
+            writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+            /*
+             * Re-schedule the tasklet to handle eventual log entries added
+             * between reading the log above and re-enabling the interrupt.
+             */
+            tasklet_schedule(&amd_iommu_irq_tasklet);
+        }
+    }
 
     spin_unlock_irqrestore(&iommu->lock, flags);
 }
@@ -733,11 +752,14 @@ static void iommu_interrupt_handler(int
 
     spin_lock_irqsave(&iommu->lock, flags);
 
-    /* Silence interrupts from both event and PPR logging */
-    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
-    iommu_clear_bit(&entry, IOMMU_STATUS_EVENT_LOG_INT_SHIFT);
-    iommu_clear_bit(&entry, IOMMU_STATUS_PPR_LOG_INT_SHIFT);
-    writel(entry, iommu->mmio_base+IOMMU_STATUS_MMIO_OFFSET);
+    /*
+     * Silence interrupts from both event and PPR by clearing the
+     * enable logging bits in the control register
+     */
+    entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+    iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT);
+    iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
+    writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
 
     spin_unlock_irqrestore(&iommu->lock, flags);
 
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -336,14 +336,17 @@
 #define IOMMU_CONTROL_ISOCHRONOUS_SHIFT 11
 #define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK 0x00001000
 #define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT 12
+#define IOMMU_CONTROL_PPR_LOG_ENABLE_MASK 0x00002000
+#define IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT 13
+#define IOMMU_CONTROL_PPR_LOG_INT_MASK 0x00004000
+#define IOMMU_CONTROL_PPR_LOG_INT_SHIFT 14
+#define IOMMU_CONTROL_PPR_ENABLE_MASK 0x00008000
+#define IOMMU_CONTROL_PPR_ENABLE_SHIFT 15
+#define IOMMU_CONTROL_GT_ENABLE_MASK 0x00010000
+#define IOMMU_CONTROL_GT_ENABLE_SHIFT 16
 #define IOMMU_CONTROL_RESTART_MASK 0x80000000
 #define IOMMU_CONTROL_RESTART_SHIFT 31
 
-#define IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT 13
-#define IOMMU_CONTROL_PPR_INT_SHIFT 14
-#define IOMMU_CONTROL_PPR_ENABLE_SHIFT 15
-#define IOMMU_CONTROL_GT_ENABLE_SHIFT 16
-
 /* Exclusion Register */
 #define IOMMU_EXCLUSION_BASE_LOW_OFFSET 0x20
 #define IOMMU_EXCLUSION_BASE_HIGH_OFFSET 0x24
@@ -395,9 +398,18 @@
 #define IOMMU_STATUS_EVENT_LOG_RUN_SHIFT 3
 #define IOMMU_STATUS_CMD_BUFFER_RUN_MASK 0x00000010
 #define IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT 4
+#define IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK 0x00000020
 #define IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT 5
+#define IOMMU_STATUS_PPR_LOG_INT_MASK 0x00000040
 #define IOMMU_STATUS_PPR_LOG_INT_SHIFT 6
+#define IOMMU_STATUS_PPR_LOG_RUN_MASK 0x00000080
 #define IOMMU_STATUS_PPR_LOG_RUN_SHIFT 7
+#define IOMMU_STATUS_GAPIC_LOG_OVERFLOW_MASK 0x00000100
+#define IOMMU_STATUS_GAPIC_LOG_OVERFLOW_SHIFT 8
+#define IOMMU_STATUS_GAPIC_LOG_INT_MASK 0x00000200
+#define IOMMU_STATUS_GAPIC_LOG_INT_SHIFT 9
+#define IOMMU_STATUS_GAPIC_LOG_RUN_MASK 0x00000400
+#define IOMMU_STATUS_GAPIC_LOG_RUN_SHIFT 10
 
 /* I/O Page Table */
 #define IOMMU_PAGE_TABLE_ENTRY_SIZE 8
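The RW1C pitfall the first two hunks fix can be shown in isolation. A minimal sketch with hypothetical register and bit names (not the actual AMD IOMMU layout):

    #include <stdint.h>

    #define EVENT_LOG_INT   0x00000002u   /* hypothetical RW1C status bit */
    #define COMP_WAIT_INT   0x00000004u   /* hypothetical RW1C status bit */

    /* Buggy under RW1C semantics: the read-modify-write writes a 1 back
     * for every status bit that happened to be set, acknowledging all of
     * them, not just the one intended. */
    static void ack_event_racy(volatile uint32_t *status)
    {
        uint32_t v = *status;
        v |= EVENT_LOG_INT;
        *status = v;                      /* may also clear COMP_WAIT_INT */
    }

    /* Correct: write a value containing only the bit to be cleared. */
    static void ack_event(volatile uint32_t *status)
    {
        *status = EVENT_LOG_INT;
    }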
@@ -1,57 +0,0 @@
# Commit 9eabb0735400e2b6059dfa3f0b47a426f61f570a
# Date 2013-07-02 08:50:41 +0200
# Author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
iommu/amd: Workaround for erratum 787

The IOMMU interrupt handling in the bottom half must clear the PPR log
interrupt and event log interrupt bits to re-enable the interrupt. This is
done by writing 1 to the memory mapped register to clear the bit. Due to a
hardware bug, if the driver tries to clear this bit while the IOMMU hardware
is also setting this bit, the conflict will result in the bit being set. If
the interrupt handling code does not make sure to clear this bit, subsequent
changes in the event/PPR logs will no longer generate interrupts, and buffer
overflow would result. After clearing the bits, the driver must read back
the register to verify.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

Adjust to apply on top of heavily modified patch 1. Adjust flow to get away
with a single readl() in each instance of the status register checks.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -636,6 +636,14 @@ static void iommu_check_event_log(struct
         }
     }
 
+    /*
+     * Workaround for erratum787:
+     * Re-check to make sure the bit has been cleared.
+     */
+    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    if ( entry & IOMMU_STATUS_EVENT_LOG_INT_MASK )
+        tasklet_schedule(&amd_iommu_irq_tasklet);
+
     spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -717,6 +725,14 @@ static void iommu_check_ppr_log(struct a
         }
     }
 
+    /*
+     * Workaround for erratum787:
+     * Re-check to make sure the bit has been cleared.
+     */
+    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    if ( entry & IOMMU_STATUS_PPR_LOG_INT_MASK )
+        tasklet_schedule(&amd_iommu_irq_tasklet);
+
     spin_unlock_irqrestore(&iommu->lock, flags);
 }
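The shape of the workaround, reduced to a sketch (hypothetical helpers; in the patch the "run once more" step is tasklet_schedule()):

    #include <stdint.h>

    /* Acknowledge, process, then re-read: if the hardware set the RW1C
     * bit concurrently with our write-1-to-clear (erratum 787), the
     * clear is lost and no further interrupt will fire, so schedule
     * another pass ourselves. */
    static void check_log(volatile uint32_t *status, uint32_t int_mask,
                          void (*process_log)(void), void (*run_again)(void))
    {
        *status = int_mask;               /* RW1C acknowledge */
        process_log();
        if ( *status & int_mask )         /* set again, or never cleared */
            run_again();
    }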
@@ -1,30 +0,0 @@
# Commit d3a55d7d9bb518efe08143d050deff9f4ee80ec1
# Date 2013-07-04 10:33:18 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mm: Ensure useful progress in alloc_l2_table()

While debugging the issue which turned out to be XSA-58, a printk in this loop
showed that it was quite easy to never make useful progress, because of
consistently failing the preemption check.

One single l2 entry is a reasonable amount of work to do, even if an action is
pending, and also assures forwards progress across repeat continuations.

Tweak the continuation criteria to fail on the first iteration of the loop.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1278,7 +1278,8 @@ static int alloc_l2_table(struct page_in
 
     for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
     {
-        if ( preemptible && i && hypercall_preempt_check() )
+        if ( preemptible && i > page->nr_validated_ptes
+             && hypercall_preempt_check() )
         {
             page->nr_validated_ptes = i;
             rc = -EAGAIN;
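The off-by-one nature of the fix is easier to see in a reduced model (hypothetical names; hypercall_preempt_check() is modelled as a callback):

    /* With the old "i && preempt()" test, a continuation restarting at
     * nr_validated == 5 could bail out immediately at i == 5 and never
     * advance. Requiring i > start guarantees at least one entry of
     * progress per invocation. */
    static int validate_entries(unsigned int *nr_validated, unsigned int total,
                                int (*preempt_pending)(void))
    {
        unsigned int start = *nr_validated, i;

        for ( i = start; i < total; i++ )
        {
            if ( i > start && preempt_pending() )
            {
                *nr_validated = i;    /* i > start: progress was made */
                return -1;            /* caller retries later (-EAGAIN) */
            }
            /* ... validate entry i here ... */
        }
        *nr_validated = total;
        return 0;
    }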
@@ -1,37 +0,0 @@
References: bnc#817799

# Commit 4867685f7916bb594a67f2f64a28bbf5ecb4949c
# Date 2013-07-08 13:20:20 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
Revert "hvmloader: always include HPET table"

This reverts commit e4fd0475a08fda414da27c4e57b568f147cfc07e.

Conflicts:
	tools/firmware/hvmloader/acpi/build.c

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir.xen@gmail.com>

--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -268,11 +268,13 @@ static int construct_secondary_tables(un
         table_ptrs[nr_tables++] = (unsigned long)madt;
     }
 
-    /* HPET. Always included in DSDT, so always include it here too. */
-    /* (And it's unconditionally required by Windows SVVP tests.) */
-    hpet = construct_hpet();
-    if (!hpet) return -1;
-    table_ptrs[nr_tables++] = (unsigned long)hpet;
+    /* HPET. */
+    if ( hpet_exists(ACPI_HPET_ADDRESS) )
+    {
+        hpet = construct_hpet();
+        if (!hpet) return -1;
+        table_ptrs[nr_tables++] = (unsigned long)hpet;
+    }
 
     /* WAET. */
     waet = construct_waet();
@@ -1,27 +0,0 @@
# Commit 5656b93d215d7c5160790ea87758625ba1de16b1
# Date 2013-07-10 10:03:40 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
adjust x86 EFI build

While the rule to generate .init.o files from .o ones already correctly
included $(extra-y), the setting of the necessary compiler flag didn't
have the same. With some yet to be posted patch this resulted in build
breakage because of the compiler deciding not to inline a few functions
(which then results in .text not being empty as required for these
object files).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/Rules.mk
+++ b/xen/Rules.mk
@@ -101,7 +101,7 @@ obj-y := $(patsubst %/,%/built-in.o,$
 
 subdir-all := $(subdir-y) $(subdir-n)
 
-$(filter %.init.o,$(obj-y) $(obj-bin-y)): CFLAGS += -DINIT_SECTIONS_ONLY
+$(filter %.init.o,$(obj-y) $(obj-bin-y) $(extra-y)): CFLAGS += -DINIT_SECTIONS_ONLY
 
 $(obj-$(coverage)): CFLAGS += -fprofile-arcs -ftest-coverage -DTEST_COVERAGE
@@ -31,8 +31,10 @@ Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
Index: xen-4.3.1-testing/xen/drivers/passthrough/amd/iommu_acpi.c
===================================================================
--- xen-4.3.1-testing.orig/xen/drivers/passthrough/amd/iommu_acpi.c
+++ xen-4.3.1-testing/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -72,12 +72,15 @@ static void __init add_ivrs_mapping_entr
     /* allocate per-device interrupt remapping table */
     if ( amd_iommu_perdev_intremap )
@@ -51,16 +53,16 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
         }
     }
     /* assgin iommu hardware */
@@ -671,7 +674,7 @@ static u16 __init parse_ivhd_device_spec
         if ( IO_APIC_ID(apic) != special->handle )
             continue;
@@ -678,7 +681,7 @@ static u16 __init parse_ivhd_device_spec
             return 0;
         }

-    if ( ioapic_sbdf[special->handle].pin_setup )
+    if ( ioapic_sbdf[special->handle].pin_2_idx )
     {
         if ( ioapic_sbdf[special->handle].bdf == bdf &&
              ioapic_sbdf[special->handle].seg == seg )
@@ -691,14 +694,17 @@ static u16 __init parse_ivhd_device_spec
@@ -698,14 +701,17 @@ static u16 __init parse_ivhd_device_spec
         ioapic_sbdf[special->handle].bdf = bdf;
         ioapic_sbdf[special->handle].seg = seg;

@@ -81,7 +83,7 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
         }
         break;
     }
@@ -926,7 +932,7 @@ static int __init parse_ivrs_table(struc
@@ -933,7 +939,7 @@ static int __init parse_ivrs_table(struc
     for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic )
     {
         if ( !nr_ioapic_entries[apic] ||
@@ -90,7 +92,7 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
             continue;

         printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n",
@@ -935,9 +941,12 @@ static int __init parse_ivrs_table(struc
@@ -942,9 +948,12 @@ static int __init parse_ivrs_table(struc
             error = -ENXIO;
         else
         {
@@ -106,8 +108,10 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
         {
             printk(XENLOG_ERR "IVHD Error: Out of memory\n");
             error = -ENOMEM;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
Index: xen-4.3.1-testing/xen/drivers/passthrough/amd/iommu_intr.c
===================================================================
--- xen-4.3.1-testing.orig/xen/drivers/passthrough/amd/iommu_intr.c
+++ xen-4.3.1-testing/xen/drivers/passthrough/amd/iommu_intr.c
@@ -31,6 +31,7 @@
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
@@ -580,8 +584,10 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+
+    return rc;
 }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
Index: xen-4.3.1-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
===================================================================
--- xen-4.3.1-testing.orig/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ xen-4.3.1-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -637,7 +637,7 @@ const struct iommu_ops amd_iommu_ops = {
     .get_device_group_id = amd_iommu_group_id,
     .update_ire_from_apic = amd_iommu_ioapic_update_ire,
@@ -591,8 +597,10 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
     .read_msi_from_ire = amd_iommu_read_msi_from_ire,
     .setup_hpet_msi = amd_setup_hpet_msi,
     .suspend = amd_iommu_suspend,
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
Index: xen-4.3.1-testing/xen/include/asm-x86/amd-iommu.h
===================================================================
--- xen-4.3.1-testing.orig/xen/include/asm-x86/amd-iommu.h
+++ xen-4.3.1-testing/xen/include/asm-x86/amd-iommu.h
@@ -119,6 +119,7 @@ struct ivrs_mappings {

     /* per device interrupt remapping table */
@@ -601,8 +609,10 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
     spinlock_t intremap_lock;

     /* ivhd device data settings */
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
Index: xen-4.3.1-testing/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
===================================================================
--- xen-4.3.1-testing.orig/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ xen-4.3.1-testing/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -470,10 +470,6 @@
 #define MAX_AMD_IOMMUS 32

@@ -614,8 +624,10 @@ Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 #define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001
 #define INT_REMAP_ENTRY_REMAPEN_SHIFT 0
 #define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
Index: xen-4.3.1-testing/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
===================================================================
--- xen-4.3.1-testing.orig/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ xen-4.3.1-testing/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -89,10 +89,12 @@ struct amd_iommu *find_iommu_for_device(

 /* interrupt remapping */
@@ -1,44 +0,0 @@
# Commit 85047d9e4f4afeb73bca1e98f705a2f4f1d51c03
# Date 2013-07-17 08:45:20 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/cpuidle: Change logging for unknown APIC IDs

Dom0 uses this hypercall to pass ACPI information to Xen. It is not very
uncommon for more cpus to be listed in the ACPI tables than are present on the
system, particularly on systems with a common BIOS for 2- and 4-socket server
variants.

As Dom0 does not control the number of entries in the ACPI tables, and is
required to pass everything it finds to Xen, change the logging.

There is now a single unconditional warning for the first unknown ID, and
further warnings if "cpuinfo" is requested by the user on the command line.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -1031,7 +1031,10 @@ long set_cx_pminfo(uint32_t cpu, struct
     cpu_id = get_cpu_id(cpu);
     if ( cpu_id == -1 )
     {
-        printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+        static bool_t warn_once = 1;
+        if ( warn_once || opt_cpu_info )
+            printk(XENLOG_WARNING "No CPU ID for APIC ID %#x\n", cpu);
+        warn_once = 0;
         return -EINVAL;
     }
 
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -63,7 +63,7 @@ static struct cpu_dev default_cpu = {
 };
 static struct cpu_dev * this_cpu = &default_cpu;
 
-bool_t __cpuinitdata opt_cpu_info;
+bool_t opt_cpu_info;
 boolean_param("cpuinfo", opt_cpu_info);
 
 int __cpuinit get_model_name(struct cpuinfo_x86 *c)
@@ -15,9 +15,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -386,6 +386,9 @@ void vlapic_EOI_set(struct vlapi
Index: xen-4.3.1-testing/xen/arch/x86/hvm/vlapic.c
===================================================================
--- xen-4.3.1-testing.orig/xen/arch/x86/hvm/vlapic.c
+++ xen-4.3.1-testing/xen/arch/x86/hvm/vlapic.c
@@ -395,6 +395,9 @@ void vlapic_EOI_set(struct vlapi

     vlapic_clear_vector(vector, &vlapic->regs->data[APIC_ISR]);

@@ -27,9 +29,11 @@ Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>
     if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) )
         vioapic_update_EOI(vlapic_domain(vlapic), vector);

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1502,6 +1502,15 @@ static void vmx_sync_pir_to_irr(struct v
Index: xen-4.3.1-testing/xen/arch/x86/hvm/vmx/vmx.c
===================================================================
--- xen-4.3.1-testing.orig/xen/arch/x86/hvm/vmx/vmx.c
+++ xen-4.3.1-testing/xen/arch/x86/hvm/vmx/vmx.c
@@ -1507,6 +1507,15 @@ static void vmx_sync_pir_to_irr(struct v
         vlapic_set_vector(i, &vlapic->regs->data[APIC_IRR]);
 }

@@ -45,7 +49,7 @@ Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>
 static struct hvm_function_table __initdata vmx_function_table = {
     .name = "VMX",
     .cpu_up_prepare = vmx_cpu_up_prepare,
@@ -1554,6 +1563,7 @@ static struct hvm_function_table __initd
@@ -1559,6 +1568,7 @@ static struct hvm_function_table __initd
     .process_isr = vmx_process_isr,
     .deliver_posted_intr = vmx_deliver_posted_intr,
     .sync_pir_to_irr = vmx_sync_pir_to_irr,
@@ -53,7 +57,7 @@ Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
 };

@@ -1580,7 +1590,10 @@ const struct hvm_function_table * __init
@@ -1585,7 +1595,10 @@ const struct hvm_function_table * __init

     setup_ept_dump();
 }
@@ -65,8 +69,10 @@ Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>
     if ( cpu_has_vmx_posted_intr_processing )
         alloc_direct_apic_vector(&posted_intr_vector, event_check_interrupt);
     else
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
Index: xen-4.3.1-testing/xen/include/asm-x86/hvm/hvm.h
===================================================================
--- xen-4.3.1-testing.orig/xen/include/asm-x86/hvm/hvm.h
+++ xen-4.3.1-testing/xen/include/asm-x86/hvm/hvm.h
@@ -186,6 +186,7 @@ struct hvm_function_table {
     void (*process_isr)(int isr, struct vcpu *v);
     void (*deliver_posted_intr)(struct vcpu *v, u8 vector);
@@ -1,41 +0,0 @@
# Commit 68caac7f6f4687241a24e804a9fca19aa26fe183
# Date 2013-07-17 10:21:33 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: don't use destroy_xen_mappings() for vunmap()

Its attempt to tear down intermediate page table levels may race with
map_pages_to_xen() establishing them, and now that
map_domain_page_global() is backed by vmap() this teardown is also
wasteful (as it's very likely to need the same address space populated
again within foreseeable time).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -196,9 +196,13 @@ void *vmap(const unsigned long *mfn, uns
 
 void vunmap(const void *va)
 {
+#ifndef _PAGE_NONE
     unsigned long addr = (unsigned long)va;
 
     destroy_xen_mappings(addr, addr + PAGE_SIZE * vm_size(va));
+#else /* Avoid tearing down intermediate page tables. */
+    map_pages_to_xen((unsigned long)va, 0, vm_size(va), _PAGE_NONE);
+#endif
     vm_free(va);
 }
 #endif
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -288,6 +288,7 @@ extern l1_pgentry_t l1_identmap[L1_PAGET
 void paging_init(void);
 #endif /* !defined(__ASSEMBLY__) */
 
+#define _PAGE_NONE    _AC(0x000,U)
 #define _PAGE_PRESENT _AC(0x001,U)
 #define _PAGE_RW      _AC(0x002,U)
 #define _PAGE_USER    _AC(0x004,U)
@@ -1,24 +0,0 @@
# Commit 915a59f25c5eddd86bc2cae6389d0ed2ab87e69e
# Date 2013-07-18 09:16:15 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/time: Update wallclock in shared info when altering domain time offset

domain_set_time_offset() updates d->time_offset_seconds, but does not correct
the wallclock in the shared info, meaning that it is incorrect until the next
XENPF_settime hypercall from dom0 which resynchronises the wallclock for all
domains.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -931,6 +931,7 @@ void domain_set_time_offset(struct domai
     d->time_offset_seconds = time_offset_seconds;
     if ( is_hvm_domain(d) )
         rtc_update_clock(d);
+    update_domain_wallclock_time(d);
 }
 
 int cpu_frequency_change(u64 freq)
@@ -1,62 +0,0 @@
# Commit b0e55bd49725c7c0183eb18670997b9e5930adac
# Date 2013-08-05 18:40:23 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
fix off-by-one mistakes in vm_alloc()

Also add another pair of assertions to catch eventual further cases of
incorrect accounting.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -57,8 +57,8 @@ void *vm_alloc(unsigned int nr, unsigned
     {
         struct page_info *pg;
 
-        ASSERT(!test_bit(vm_low, vm_bitmap));
-        for ( start = vm_low; ; )
+        ASSERT(vm_low == vm_top || !test_bit(vm_low, vm_bitmap));
+        for ( start = vm_low; start < vm_top; )
         {
             bit = find_next_bit(vm_bitmap, vm_top, start + 1);
             if ( bit > vm_top )
@@ -68,12 +68,18 @@ void *vm_alloc(unsigned int nr, unsigned
              * corresponding page a guard one.
              */
             start = (start + align) & ~(align - 1);
-            if ( start + nr <= bit )
-                break;
-            start = bit < vm_top ?
-                    find_next_zero_bit(vm_bitmap, vm_top, bit + 1) : bit;
-            if ( start >= vm_top )
-                break;
+            if ( bit < vm_top )
+            {
+                if ( start + nr < bit )
+                    break;
+                start = find_next_zero_bit(vm_bitmap, vm_top, bit + 1);
+            }
+            else
+            {
+                if ( start + nr <= bit )
+                    break;
+                start = bit;
+            }
         }
 
         if ( start < vm_top )
@@ -115,6 +121,10 @@ void *vm_alloc(unsigned int nr, unsigned
 
     for ( bit = start; bit < start + nr; ++bit )
         __set_bit(bit, vm_bitmap);
+    if ( bit < vm_top )
+        ASSERT(!test_bit(bit, vm_bitmap));
+    else
+        ASSERT(bit == vm_top);
     if ( start <= vm_low + 2 )
         vm_low = bit;
     spin_unlock(&vm_lock);
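The corrected boundary conditions can be isolated as follows (a sketch, not the Xen allocator): a candidate free run that ends at an allocated slot must leave one unallocated slot as a guard page, while a run extending to the top of the arena needs none.

    /* next_set is the first allocated bit at or after start (== top when
     * the rest of the bitmap is free). */
    static int run_fits(unsigned int start, unsigned int nr,
                        unsigned int next_set, unsigned int top)
    {
        if ( next_set < top )
            return start + nr < next_set;    /* strict: keep a guard slot */
        return start + nr <= next_set;       /* at the top: no guard needed */
    }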
@@ -1,60 +0,0 @@
# Commit c58d9f2f4844c2ce8859a8d0f26a54cd058eb51f
# Date 2013-08-05 18:42:37 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: refine FPU selector handling code for XSAVEOPT

Some extra tweaks are necessary to deal with the situation of XSAVEOPT
not writing the FPU portion of the save image (due to it detecting that
the register state did not get modified since the last XRSTOR).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Ben Guthro <ben.guthro@gmail.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -71,10 +71,28 @@ void xsave(struct vcpu *v, uint64_t mask
 
     if ( word_size <= 0 || !is_pv_32bit_vcpu(v) )
     {
+        typeof(ptr->fpu_sse.fip.sel) fcs = ptr->fpu_sse.fip.sel;
+        typeof(ptr->fpu_sse.fdp.sel) fds = ptr->fpu_sse.fdp.sel;
+
         if ( cpu_has_xsaveopt )
+        {
+            /*
+             * xsaveopt may not write the FPU portion even when the respective
+             * mask bit is set. For the check further down to work we hence
+             * need to put the save image back into the state that it was in
+             * right after the previous xsaveopt.
+             */
+            if ( word_size > 0 &&
+                 (ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET] == 4 ||
+                  ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET] == 2) )
+            {
+                ptr->fpu_sse.fip.sel = 0;
+                ptr->fpu_sse.fdp.sel = 0;
+            }
             asm volatile ( ".byte 0x48,0x0f,0xae,0x37"
                            : "=m" (*ptr)
                            : "a" (lmask), "d" (hmask), "D" (ptr) );
+        }
         else
             asm volatile ( ".byte 0x48,0x0f,0xae,0x27"
                            : "=m" (*ptr)
@@ -87,7 +105,14 @@ void xsave(struct vcpu *v, uint64_t mask
              */
             (!(ptr->fpu_sse.fsw & 0x0080) &&
              boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+        {
+            if ( cpu_has_xsaveopt && word_size > 0 )
+            {
+                ptr->fpu_sse.fip.sel = fcs;
+                ptr->fpu_sse.fdp.sel = fds;
+            }
             return;
+        }
 
         if ( word_size > 0 &&
              !((ptr->fpu_sse.fip.addr | ptr->fpu_sse.fdp.addr) >> 32) )
@@ -1,23 +0,0 @@
# Commit e1ab5c77b44b7bd835a2c032fa4963b36545fdb3
# Date 2013-08-06 17:22:35 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Flush TLBs and Caches if paging mode changed

According to the SDM, if the paging mode is changed, then the whole TLBs and
caches will be flushed. This was missed in the nested handling logic. This
also fixes the issue that 64-bit Windows cannot boot up on top of L1 KVM.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -709,6 +709,7 @@ void paging_update_nestedmode(struct vcp
     else
         /* TODO: shadow-on-shadow */
        v->arch.paging.nestedmode = NULL;
+    hvm_asid_flush_vcpu(v);
 }
 
 void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
@@ -1,40 +0,0 @@
# Commit 66450c1d1ab3c4480bbba949113b95d1ab6a943a
# Date 2013-08-06 17:45:00 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
xen/conring: Write to console ring even if console lock is busted

console_lock_busted gets set when an NMI/MCE/Double Fault handler decides to
bring Xen down in an emergency. conring_puts() cannot block and does
not have problematic interactions with the console_lock.

Therefore, choosing to not put the string into the console ring simply means
that the kexec environment can't find any panic() message caused by an IST
interrupt, which is unhelpful for debugging purposes.

In the case that two pcpus fight with console_force_unlock(), having slightly
garbled strings in the console ring is far more useful than having nothing at
all.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Matt Wilson <msw@amazon.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/drivers/char/console.c
+++ b/xen/drivers/char/console.c
@@ -463,12 +463,11 @@ static void __putstr(const char *str)
     sercon_puts(str);
     video_puts(str);
 
+    while ( (c = *str++) != '\0' )
+        putchar_console_ring(c);
+
     if ( !console_locks_busted )
-    {
-        while ( (c = *str++) != '\0' )
-            putchar_console_ring(c);
         tasklet_schedule(&notify_dom0_con_ring_tasklet);
-    }
 }
 
 static int printk_prefix_check(char *p, char **pp)
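A sketch of the resulting control flow, with hypothetical helpers standing in for putchar_console_ring() and the dom0 notification tasklet:

    static char ring[4096];
    static unsigned int prod;

    /* Lock-free single-producer ring insert: safe on NMI/crash paths. */
    static void ring_putc(char c)
    {
        ring[prod++ % sizeof(ring)] = c;
    }

    static void notify_dom0(void) { /* normal console path; may take locks */ }

    static void putstr(const char *str, int locks_busted)
    {
        char c;

        while ( (c = *str++) != '\0' )  /* always record, even when busted */
            ring_putc(c);

        if ( !locks_busted )            /* gate only the lock-coupled part */
            notify_dom0();
    }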
@@ -1,138 +0,0 @@
# Commit 85fc517ec3055e8e8d9c9e36e15a81e630237252
# Date 2013-08-13 14:22:14 +0200
# Author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/AMD: Fix nested svm crash due to assertion in __virt_to_maddr

Fix assertion in __virt_to_maddr when starting nested SVM guest
in debug mode. Investigation has shown that svm_vmsave/svm_vmload
make use of __pa() with an invalid address.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Reviewed-by: Tim Deegan <tim@xen.org>

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1792,6 +1792,32 @@ svm_vmexit_do_vmrun(struct cpu_user_regs
     return;
 }
 
+static struct page_info *
+nsvm_get_nvmcb_page(struct vcpu *v, uint64_t vmcbaddr)
+{
+    p2m_type_t p2mt;
+    struct page_info *page;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
+        return NULL;
+
+    /* Need to translate L1-GPA to MPA */
+    page = get_page_from_gfn(v->domain,
+                             nv->nv_vvmcxaddr >> PAGE_SHIFT,
+                             &p2mt, P2M_ALLOC | P2M_UNSHARE);
+    if ( !page )
+        return NULL;
+
+    if ( !p2m_is_ram(p2mt) || p2m_is_readonly(p2mt) )
+    {
+        put_page(page);
+        return NULL;
+    }
+
+    return page;
+}
+
 static void
 svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
                      struct cpu_user_regs *regs,
@@ -1799,7 +1825,7 @@ svm_vmexit_do_vmload(struct vmcb_struct
 {
     int ret;
     unsigned int inst_len;
-    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct page_info *page;
 
     if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
         return;
@@ -1810,13 +1836,18 @@ svm_vmexit_do_vmload(struct vmcb_struct
         goto inject;
     }
 
-    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
-        gdprintk(XENLOG_ERR, "VMLOAD: mapping vmcb failed, injecting #UD\n");
+    page = nsvm_get_nvmcb_page(v, vmcbaddr);
+    if ( !page )
+    {
+        gdprintk(XENLOG_ERR,
+                 "VMLOAD: mapping failed, injecting #UD\n");
         ret = TRAP_invalid_op;
         goto inject;
     }
 
-    svm_vmload(nv->nv_vvmcx);
+    svm_vmload_pa(page_to_maddr(page));
+    put_page(page);
+
     /* State in L1 VMCB is stale now */
     v->arch.hvm_svm.vmcb_in_sync = 0;
 
@@ -1835,7 +1866,7 @@ svm_vmexit_do_vmsave(struct vmcb_struct
 {
     int ret;
     unsigned int inst_len;
-    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct page_info *page;
 
     if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
         return;
@@ -1846,14 +1877,17 @@ svm_vmexit_do_vmsave(struct vmcb_struct
         goto inject;
     }
 
-    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
-        gdprintk(XENLOG_ERR, "VMSAVE: mapping vmcb failed, injecting #UD\n");
+    page = nsvm_get_nvmcb_page(v, vmcbaddr);
+    if ( !page )
+    {
+        gdprintk(XENLOG_ERR,
+                 "VMSAVE: mapping vmcb failed, injecting #UD\n");
         ret = TRAP_invalid_op;
         goto inject;
     }
 
-    svm_vmsave(nv->nv_vvmcx);
-
+    svm_vmsave_pa(page_to_maddr(page));
+    put_page(page);
     __update_guest_eip(regs, inst_len);
     return;
 
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -41,18 +41,21 @@
 #define SVM_REG_R14 (14)
 #define SVM_REG_R15 (15)
 
-static inline void svm_vmload(void *vmcb)
+#define svm_vmload(x) svm_vmload_pa(__pa(x))
+#define svm_vmsave(x) svm_vmsave_pa(__pa(x))
+
+static inline void svm_vmload_pa(paddr_t vmcb)
 {
     asm volatile (
         ".byte 0x0f,0x01,0xda" /* vmload */
-        : : "a" (__pa(vmcb)) : "memory" );
+        : : "a" (vmcb) : "memory" );
 }
 
-static inline void svm_vmsave(void *vmcb)
+static inline void svm_vmsave_pa(paddr_t vmcb)
 {
     asm volatile (
         ".byte 0x0f,0x01,0xdb" /* vmsave */
-        : : "a" (__pa(vmcb)) : "memory" );
+        : : "a" (vmcb) : "memory" );
 }
 
 static inline void svm_invlpga(unsigned long vaddr, uint32_t asid)
@@ -1,91 +0,0 @@
# Commit 910daaf5aaa837624099c0fc5c373bea7202ff43
# Date 2013-08-13 14:24:16 +0200
# Author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/AMD: Inject #GP instead of #UD when unable to map vmcb

According to AMD Programmer's Manual vol2, vmrun, vmsave and vmload
should inject #GP instead of #UD when unable to access the memory
location for the vmcb. Also, the code should make sure that the L1 guest
EFER.SVME is not zero. Otherwise, #UD should be injected.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Reviewed-by: Tim Deegan <tim@xen.org>

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1776,15 +1776,17 @@ static void
 svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
                     struct vcpu *v, uint64_t vmcbaddr)
 {
-    if (!nestedhvm_enabled(v->domain)) {
+    if ( !nsvm_efer_svm_enabled(v) )
+    {
         gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
         hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
         return;
     }
 
-    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
-        gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #UD\n");
-        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+    if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
+    {
+        gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #GP\n");
+        hvm_inject_hw_exception(TRAP_gp_fault, HVM_DELIVER_NO_ERROR_CODE);
         return;
     }
 
@@ -1830,7 +1832,8 @@ svm_vmexit_do_vmload(struct vmcb_struct
     if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
         return;
 
-    if (!nestedhvm_enabled(v->domain)) {
+    if ( !nsvm_efer_svm_enabled(v) )
+    {
         gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
         ret = TRAP_invalid_op;
         goto inject;
@@ -1840,8 +1843,8 @@ svm_vmexit_do_vmload(struct vmcb_struct
     if ( !page )
     {
         gdprintk(XENLOG_ERR,
-                 "VMLOAD: mapping failed, injecting #UD\n");
-        ret = TRAP_invalid_op;
+                 "VMLOAD: mapping failed, injecting #GP\n");
+        ret = TRAP_gp_fault;
         goto inject;
     }
 
@@ -1871,7 +1874,8 @@ svm_vmexit_do_vmsave(struct vmcb_struct
     if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
         return;
 
-    if (!nestedhvm_enabled(v->domain)) {
+    if ( !nsvm_efer_svm_enabled(v) )
+    {
         gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
         ret = TRAP_invalid_op;
         goto inject;
@@ -1881,8 +1885,8 @@ svm_vmexit_do_vmsave(struct vmcb_struct
     if ( !page )
     {
         gdprintk(XENLOG_ERR,
-                 "VMSAVE: mapping vmcb failed, injecting #UD\n");
-        ret = TRAP_invalid_op;
+                 "VMSAVE: mapping vmcb failed, injecting #GP\n");
+        ret = TRAP_gp_fault;
         goto inject;
     }
 
--- a/xen/include/asm-x86/hvm/svm/nestedsvm.h
+++ b/xen/include/asm-x86/hvm/svm/nestedsvm.h
@@ -94,7 +94,7 @@ struct nestedsvm {
 #define vcpu_nestedsvm(v) (vcpu_nestedhvm(v).u.nsvm)
 
 /* True when l1 guest enabled SVM in EFER */
-#define hvm_svm_enabled(v) \
+#define nsvm_efer_svm_enabled(v) \
     (!!((v)->arch.hvm_vcpu.guest_efer & EFER_SVME))
 
 int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr);
@@ -1,49 +0,0 @@
# Commit 7b9fa702ca323164d6b49e8b639a57f880454a8c
# Date 2013-08-13 14:31:01 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
watchdog/crash: Always disable watchdog in console_force_unlock()

Depending on the state of the conring and serial_tx_buffer,
console_force_unlock() can be a long running operation, usually because of
serial_start_sync().

XenServer testing has found a reliable case where console_force_unlock() on
one PCPU takes long enough for another PCPU to time out due to the watchdog
(such as waiting for a tlb flush callin).

The watchdog timeout causes the second PCPU to repeat the
console_force_unlock(), at which point the first PCPU typically fails an
assertion in spin_unlock_irqrestore(&port->tx_lock) (because the tx_lock has
been unlocked behind itself).

console_force_unlock() is only on emergency paths, so one way or another the
host is going down. Disable the watchdog before forcing the console lock to
help prevent having pcpus competing with each other to bring the host down.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -226,8 +226,6 @@ void do_double_fault(struct cpu_user_reg
     unsigned int cpu;
     unsigned long crs[8];
 
-    watchdog_disable();
-
     console_force_unlock();
 
     asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
--- a/xen/drivers/char/console.c
+++ b/xen/drivers/char/console.c
@@ -736,6 +736,9 @@ void console_end_log_everything(void)
 
 void console_force_unlock(void)
 {
+#ifdef CONFIG_X86
+    watchdog_disable();
+#endif
     spin_lock_init(&console_lock);
     serial_force_unlock(sercon_handle);
     console_locks_busted = 1;
@@ -1,38 +0,0 @@
# Commit 0c006b41a283a0a569c863d44abde5aa5750ae01
# Date 2013-08-13 17:47:16 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
VMX: add boot parameter to enable/disable APIC-v dynamically

Add a boot parameter to enable/disable the APIC-v dynamically. APIC-v is
enabled by default. User can use apicv=0 to disable it.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -46,6 +46,9 @@ boolean_param("vpid", opt_vpid_enabled);
 static bool_t __read_mostly opt_unrestricted_guest_enabled = 1;
 boolean_param("unrestricted_guest", opt_unrestricted_guest_enabled);
 
+static bool_t __read_mostly opt_apicv_enabled = 1;
+boolean_param("apicv", opt_apicv_enabled);
+
 /*
  * These two parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap: upper bound on the amount of time between two successive
@@ -196,12 +199,12 @@ static int vmx_init_vmcs_config(void)
      * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
      * can be set only when "use TPR shadow" is set
      */
-    if ( _vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW )
+    if ( (_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) &&
+         opt_apicv_enabled )
         opt |= SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 
-
     _vmx_secondary_exec_control = adjust_vmx_controls(
         "Secondary Exec Control", min, opt,
         MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch);
@@ -1,41 +0,0 @@
# Commit e8e8b030ecf916fea19639f0b6a446c1c9dbe174
# Date 2013-08-14 11:18:24 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
VT-d: protect against bogus information coming from BIOS

Add checks similar to those done by Linux: The DRHD address must not
be all zeros or all ones (Linux only checks for zero), and capabilities
as well as extended capabilities must not be all ones.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Ben Guthro <benjamin.guthro@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Ben Guthro <benjamin.guthro@citrix.com>
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -447,6 +447,9 @@ acpi_parse_one_drhd(struct acpi_dmar_hea
     if ( (ret = acpi_dmar_check_length(header, sizeof(*drhd))) != 0 )
         return ret;
 
+    if ( !drhd->address || !(drhd->address + 1) )
+        return -ENODEV;
+
     dmaru = xzalloc(struct acpi_drhd_unit);
     if ( !dmaru )
         return -ENOMEM;
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -1159,6 +1159,9 @@ int __init iommu_alloc(struct acpi_drhd_
         dprintk(VTDPREFIX,
                 "cap = %"PRIx64" ecap = %"PRIx64"\n", iommu->cap, iommu->ecap);
     }
+    if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
+        return -ENODEV;
+
     if ( cap_fault_reg_offset(iommu->cap) +
          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
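The "+ 1" in both hunks is a compact test for the all-ones pattern a broken BIOS (or a failed MMIO read) can produce; a standalone illustration:

    #include <stdint.h>

    /* value + 1 == 0 exactly when value is all ones (~0ULL). */
    static int sane_drhd_address(uint64_t address)
    {
        return address != 0 && address + 1 != 0;   /* neither 0 nor ~0 */
    }

    static int sane_caps(uint64_t cap, uint64_t ecap)
    {
        return cap + 1 != 0 && ecap + 1 != 0;      /* neither all ones */
    }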
@@ -1,24 +0,0 @@
# Commit f67af6d5803b6a015e30cb490a94f9547cb0437c
# Date 2013-08-14 11:20:26 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MTRR: fix range check in mtrr_add_page()

Extracted from Yinghai Lu's Linux commit d5c78673 ("x86: Fix /proc/mtrr
with base/size more than 44bits").

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/cpu/mtrr/main.c
+++ b/xen/arch/x86/cpu/mtrr/main.c
@@ -340,7 +340,7 @@ int mtrr_add_page(unsigned long base, un
 		return -EINVAL;
 	}
 
-	if (base & size_or_mask || size & size_or_mask) {
+	if ((base | (base + size - 1)) >> (paddr_bits - PAGE_SHIFT)) {
 		printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
 		return -EINVAL;
 	}
@ -1,22 +0,0 @@
# Commit ab7f9a793c78dfea81c037b34b0dd2db7070d8f8
# Date 2013-08-15 13:17:10 +0200
# Author Tim Deegan <tim@xen.org>
# Committer Jan Beulich <jbeulich@suse.com>
x86/time: fix check for negative time in __update_vcpu_system_time()

Clang points out that the u64 stime variable is always >= 0.

Signed-off-by: Tim Deegan <tim@xen.org>

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -817,7 +817,8 @@ static void __update_vcpu_system_time(st

if ( d->arch.vtsc )
{
- u64 stime = t->stime_local_stamp;
+ s_time_t stime = t->stime_local_stamp;
+
if ( is_hvm_domain(d) )
{
struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time;
@ -1,132 +0,0 @@
References: bnc#833251, bnc#834751

# Commit 2ee9cbf9d8eaeff6e21222905d22dbd58dc5fe29
# Date 2013-08-21 08:38:40 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
ACPI: fix acpi_os_map_memory()

Its use of map_domain_page() was entirely wrong. Use __acpi_map_table()
instead for the time being, with locking added as the mappings it
produces get replaced with subsequent invocations. Using locking in
this way is acceptable here since the only two runtime callers are
acpi_os_{read,write}_memory(), which don't leave mappings pending upon
returning to their callers.

Also fix __acpi_map_table()'s first parameter's type - while benign for
unstable, backports to pre-4.3 trees will need this.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

# Commit c5ba8ed4c6f005d332a49d93a3ef8ff2b690b256
# Date 2013-08-21 08:40:22 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
ACPI: use ioremap() in acpi_os_map_memory()

This drops the post-boot use of __acpi_map_table() here again (together
with the somewhat awkward locking), in favor of using ioremap().

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/acpi/lib.c
+++ b/xen/arch/x86/acpi/lib.c
@@ -39,7 +39,7 @@ u32 __read_mostly x86_acpiid_to_apicid[M
* from the fixed base. That's why we start at FIX_ACPI_END and
* count idx down while incrementing the phys address.
*/
-char *__acpi_map_table(unsigned long phys, unsigned long size)
+char *__acpi_map_table(paddr_t phys, unsigned long size)
{
unsigned long base, offset, mapped_size;
int idx;
--- a/xen/drivers/acpi/osl.c
+++ b/xen/drivers/acpi/osl.c
@@ -38,6 +38,7 @@
#include <xen/spinlock.h>
#include <xen/domain_page.h>
#include <xen/efi.h>
+#include <xen/vmap.h>

#define _COMPONENT ACPI_OS_SERVICES
ACPI_MODULE_NAME("osl")
@@ -83,14 +84,25 @@ acpi_physical_address __init acpi_os_get
}
}

-void __iomem *__init
+void __iomem *
acpi_os_map_memory(acpi_physical_address phys, acpi_size size)
{
- return __acpi_map_table((unsigned long)phys, size);
+ if (system_state >= SYS_STATE_active) {
+ unsigned long pfn = PFN_DOWN(phys);
+ unsigned int offs = phys & (PAGE_SIZE - 1);
+
+ /* The low first Mb is always mapped. */
+ if ( !((phys + size - 1) >> 20) )
+ return __va(phys);
+ return __vmap(&pfn, PFN_UP(offs + size), 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs;
+ }
+ return __acpi_map_table(phys, size);
}

-void __init acpi_os_unmap_memory(void __iomem * virt, acpi_size size)
+void acpi_os_unmap_memory(void __iomem * virt, acpi_size size)
{
+ if (system_state >= SYS_STATE_active)
+ vunmap((void *)((unsigned long)virt & PAGE_MASK));
}

acpi_status acpi_os_read_port(acpi_io_address port, u32 * value, u32 width)
@@ -133,9 +145,8 @@ acpi_status
acpi_os_read_memory(acpi_physical_address phys_addr, u32 * value, u32 width)
{
u32 dummy;
- void __iomem *virt_addr;
+ void __iomem *virt_addr = acpi_os_map_memory(phys_addr, width >> 3);

- virt_addr = map_domain_page(phys_addr>>PAGE_SHIFT);
if (!value)
value = &dummy;

@@ -153,7 +164,7 @@ acpi_os_read_memory(acpi_physical_addres
BUG();
}

- unmap_domain_page(virt_addr);
+ acpi_os_unmap_memory(virt_addr, width >> 3);

return AE_OK;
}
@@ -161,9 +172,7 @@ acpi_os_read_memory(acpi_physical_addres
acpi_status
acpi_os_write_memory(acpi_physical_address phys_addr, u32 value, u32 width)
{
- void __iomem *virt_addr;
-
- virt_addr = map_domain_page(phys_addr>>PAGE_SHIFT);
+ void __iomem *virt_addr = acpi_os_map_memory(phys_addr, width >> 3);

switch (width) {
case 8:
@@ -179,7 +188,7 @@ acpi_os_write_memory(acpi_physical_addre
BUG();
}

- unmap_domain_page(virt_addr);
+ acpi_os_unmap_memory(virt_addr, width >> 3);

return AE_OK;
}
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -56,7 +56,7 @@ typedef int (*acpi_table_handler) (struc
typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end);

unsigned int acpi_get_processor_id (unsigned int cpu);
-char * __acpi_map_table (unsigned long phys_addr, unsigned long size);
+char * __acpi_map_table (paddr_t phys_addr, unsigned long size);
int acpi_boot_init (void);
int acpi_boot_table_init (void);
int acpi_numa_init (void);
@ -1,50 +0,0 @@
# Commit c9c6abab583d27fdca1d979a7f1d18ae30f54e9b
# Date 2013-08-21 16:44:58 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
VT-d: warn about Compatibility Format Interrupts being enabled by firmware

... as being insecure.

Also drop the second (redundant) read of DMAR_GSTS_REG from
enable_intremap().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -706,8 +706,8 @@ int enable_intremap(struct iommu *iommu,

if ( !platform_supports_intremap() )
{
- dprintk(XENLOG_ERR VTDPREFIX,
- "Platform firmware does not support interrupt remapping\n");
+ printk(XENLOG_ERR VTDPREFIX
+ " Platform firmware does not support interrupt remapping\n");
return -EINVAL;
}

@@ -718,15 +718,19 @@ int enable_intremap(struct iommu *iommu,
if ( (sts & DMA_GSTS_IRES) && ir_ctrl->iremap_maddr )
return 0;

- sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
if ( !(sts & DMA_GSTS_QIES) )
{
- dprintk(XENLOG_ERR VTDPREFIX,
- "Queued invalidation is not enabled, should not enable "
- "interrupt remapping\n");
+ printk(XENLOG_ERR VTDPREFIX
+ " Queued invalidation is not enabled on IOMMU #%u:"
+ " Should not enable interrupt remapping\n", iommu->index);
return -EINVAL;
}

+ if ( !eim && (sts & DMA_GSTS_CFIS) )
+ printk(XENLOG_WARNING VTDPREFIX
+ " Compatibility Format Interrupts permitted on IOMMU #%u:"
+ " Device pass-through will be insecure\n", iommu->index);
+
if ( ir_ctrl->iremap_maddr == 0 )
{
drhd = iommu_to_drhd(iommu);
@ -1,26 +0,0 @@
# Commit 7fb5c6b9ef22915e3fcac95cd44857f4457ba783
# Date 2013-08-22 10:49:24 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Check whether interrupt is blocked by TPR

If an interrupt is blocked by L1's TPR, L2 should not see it and should
keep running. Add the check before L2 retrieves the interrupt.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Acked-by: "Dong, Eddie" <eddie.dong@intel.com>

--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -165,6 +165,11 @@ static int nvmx_intr_intercept(struct vc
{
u32 ctrl;

+ /* If blocked by L1's tpr, then nothing to do. */
+ if ( nestedhvm_vcpu_in_guestmode(v) &&
+ hvm_interrupt_blocked(v, intack) == hvm_intblk_tpr )
+ return 1;
+
if ( nvmx_intr_blocked(v) != hvm_intblk_none )
{
enable_intr_window(v, intack);
@ -1,36 +0,0 @@
# Commit b35d0a26983843c092bfa353fd6b9aa8c3bf4886
# Date 2013-08-22 10:50:13 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Force check ISR when L2 is running

An external interrupt is allowed to notify the CPU only when it has
higher priority than the interrupt currently being serviced. With
APIC-v, the priority comparison is done by hardware, and hardware
injects the interrupt to the VCPU when it recognizes one. Currently
there is no virtual APIC-v feature available for L1 to use, so when L2
is running we still need to compare the interrupt priority with the
ISR in the hypervisor instead of via hardware.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Acked-by: "Dong, Eddie" <eddie.dong@intel.com>

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -37,6 +37,7 @@
#include <asm/hvm/io.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/nestedhvm.h>
#include <public/hvm/ioreq.h>
#include <public/hvm/params.h>

@@ -1037,7 +1038,8 @@ int vlapic_has_pending_irq(struct vcpu *
if ( irr == -1 )
return -1;

- if ( vlapic_virtual_intr_delivery_enabled() )
+ if ( vlapic_virtual_intr_delivery_enabled() &&
+ !nestedhvm_vcpu_in_guestmode(v) )
return irr;

isr = vlapic_find_highest_isr(vlapic);
@ -1,43 +0,0 @@
# Commit 375a1035002fb257087756a86e6caeda649fc0f1
# Date 2013-08-22 10:52:05 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Clear APIC-v control bit in vmcs02

There is no vAPIC-v support, so mask the APIC-v control bit when
constructing vmcs02.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Acked-by: "Dong, Eddie" <eddie.dong@intel.com>

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -613,8 +613,15 @@ void nvmx_update_secondary_exec_control(
u32 shadow_cntrl;
struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
+ u32 apicv_bit = SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;

+ host_cntrl &= ~apicv_bit;
shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, SECONDARY_VM_EXEC_CONTROL);
+
+ /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
+ ASSERT(!(shadow_cntrl & apicv_bit));
+
nvmx->ept.enabled = !!(shadow_cntrl & SECONDARY_EXEC_ENABLE_EPT);
shadow_cntrl |= host_cntrl;
__vmwrite(SECONDARY_VM_EXEC_CONTROL, shadow_cntrl);
@@ -625,7 +632,12 @@ static void nvmx_update_pin_control(stru
u32 shadow_cntrl;
struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);

+ host_cntrl &= ~PIN_BASED_POSTED_INTERRUPT;
shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL);
+
+ /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
+ ASSERT(!(shadow_cntrl & PIN_BASED_POSTED_INTERRUPT));
+
shadow_cntrl |= host_cntrl;
__vmwrite(PIN_BASED_VM_EXEC_CONTROL, shadow_cntrl);
}
@ -1,247 +0,0 @@
# Commit 84e6af58707520baf59c1c86c29237419e439afb
# Date 2013-08-22 10:59:01 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Update APIC-v(RVI/SVI) when vmexit to L1

If APIC-v is enabled, all interrupts to L1 are delivered through APIC-v.
But when L2 is running, an external interrupt causes an L1 vmexit with
reason "external interrupt", and L1 then picks the interrupt up through
vmcs12. When L1 acks the interrupt, APIC-v hardware will still perform
the vEOI update, since APIC-v is enabled while L1 is running. The
problem is that the interrupt was not delivered through APIC-v hardware,
meaning SVI/RVI/vPPR are not set, yet hardware requires them when doing
the vEOI update. The solution is that, when L1 tries to pick up the
interrupt from vmcs12, the hypervisor helps to update SVI/RVI/vPPR to
make sure the subsequent vEOI and vPPR updates are done correctly.

Also, since the interrupt is delivered through vmcs12, APIC-v hardware
will not clear vIRR, and the hypervisor needs to clear it before L1
resumes running.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Acked-by: "Dong, Eddie" <eddie.dong@intel.com>

--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -437,7 +437,7 @@ struct hvm_intack hvm_vcpu_ack_pending_i
intack.vector = (uint8_t)vector;
break;
case hvm_intsrc_lapic:
- if ( !vlapic_ack_pending_irq(v, intack.vector) )
+ if ( !vlapic_ack_pending_irq(v, intack.vector, 0) )
intack = hvm_intack_none;
break;
case hvm_intsrc_vector:
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -168,6 +168,14 @@ static uint32_t vlapic_get_ppr(struct vl
return ppr;
}

+uint32_t vlapic_set_ppr(struct vlapic *vlapic)
+{
+ uint32_t ppr = vlapic_get_ppr(vlapic);
+
+ vlapic_set_reg(vlapic, APIC_PROCPRI, ppr);
+ return ppr;
+}
+
static int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda)
{
int result = 0;
@@ -1050,15 +1058,15 @@ int vlapic_has_pending_irq(struct vcpu *
return irr;
}

-int vlapic_ack_pending_irq(struct vcpu *v, int vector)
+int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool_t force_ack)
{
struct vlapic *vlapic = vcpu_vlapic(v);

- if ( vlapic_virtual_intr_delivery_enabled() )
- return 1;
-
- vlapic_set_vector(vector, &vlapic->regs->data[APIC_ISR]);
- vlapic_clear_irr(vector, vlapic);
+ if ( force_ack || !vlapic_virtual_intr_delivery_enabled() )
+ {
+ vlapic_set_vector(vector, &vlapic->regs->data[APIC_ISR]);
+ vlapic_clear_irr(vector, vlapic);
+ }

return 1;
}
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -185,7 +185,7 @@ static int nvmx_intr_intercept(struct vc
if ( !(ctrl & PIN_BASED_EXT_INTR_MASK) )
return 0;

- vmx_inject_extint(intack.vector);
+ vmx_inject_extint(intack.vector, intack.source);

ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, VM_EXIT_CONTROLS);
if ( ctrl & VM_EXIT_ACK_INTR_ON_EXIT )
@@ -314,7 +314,7 @@ void vmx_intr_assist(void)
else
{
HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
- vmx_inject_extint(intack.vector);
+ vmx_inject_extint(intack.vector, intack.source);
pt_intr_post(v, intack);
}

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1205,7 +1205,7 @@ static void vmx_update_guest_efer(struct
}

void nvmx_enqueue_n2_exceptions(struct vcpu *v,
- unsigned long intr_fields, int error_code)
+ unsigned long intr_fields, int error_code, uint8_t source)
{
struct nestedvmx *nvmx = &vcpu_2_nvmx(v);

@@ -1213,6 +1213,7 @@ void nvmx_enqueue_n2_exceptions(struct v
/* enqueue the exception till the VMCS switch back to L1 */
nvmx->intr.intr_info = intr_fields;
nvmx->intr.error_code = error_code;
+ nvmx->intr.source = source;
vcpu_nestedhvm(v).nv_vmexit_pending = 1;
return;
}
@@ -1224,7 +1225,8 @@ void nvmx_enqueue_n2_exceptions(struct v

static int nvmx_vmexit_trap(struct vcpu *v, struct hvm_trap *trap)
{
- nvmx_enqueue_n2_exceptions(v, trap->vector, trap->error_code);
+ nvmx_enqueue_n2_exceptions(v, trap->vector, trap->error_code,
+ hvm_intsrc_none);
return NESTEDHVM_VMEXIT_DONE;
}

@@ -1255,7 +1257,7 @@ static void __vmx_inject_exception(int t
curr->arch.hvm_vmx.vmx_emulate = 1;
}

-void vmx_inject_extint(int trap)
+void vmx_inject_extint(int trap, uint8_t source)
{
struct vcpu *v = current;
u32 pin_based_cntrl;
@@ -1266,7 +1268,7 @@ void vmx_inject_extint(int trap)
if ( pin_based_cntrl & PIN_BASED_EXT_INTR_MASK ) {
nvmx_enqueue_n2_exceptions (v,
INTR_INFO_VALID_MASK | (X86_EVENTTYPE_EXT_INTR<<8) | trap,
- HVM_DELIVER_NO_ERROR_CODE);
+ HVM_DELIVER_NO_ERROR_CODE, source);
return;
}
}
@@ -1285,7 +1287,7 @@ void vmx_inject_nmi(void)
if ( pin_based_cntrl & PIN_BASED_NMI_EXITING ) {
nvmx_enqueue_n2_exceptions (v,
INTR_INFO_VALID_MASK | (X86_EVENTTYPE_NMI<<8) | TRAP_nmi,
- HVM_DELIVER_NO_ERROR_CODE);
+ HVM_DELIVER_NO_ERROR_CODE, hvm_intsrc_nmi);
return;
}
}
@@ -1353,7 +1355,7 @@ static void vmx_inject_trap(struct hvm_t
{
nvmx_enqueue_n2_exceptions (curr,
INTR_INFO_VALID_MASK | (_trap.type<<8) | _trap.vector,
- _trap.error_code);
+ _trap.error_code, hvm_intsrc_none);
return;
}
else
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1295,6 +1295,36 @@ static void sync_exception_state(struct
}
}

+static void nvmx_update_apicv(struct vcpu *v)
+{
+ struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
+ struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
+ unsigned long reason = __get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_REASON);
+ uint32_t intr_info = __get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_INFO);
+
+ if ( reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nvmx->intr.source == hvm_intsrc_lapic &&
+ (intr_info & INTR_INFO_VALID_MASK) )
+ {
+ uint16_t status;
+ uint32_t rvi, ppr;
+ uint32_t vector = intr_info & 0xff;
+ struct vlapic *vlapic = vcpu_vlapic(v);
+
+ vlapic_ack_pending_irq(v, vector, 1);
+
+ ppr = vlapic_set_ppr(vlapic);
+ WARN_ON((ppr & 0xf0) != (vector & 0xf0));
+
+ status = vector << 8;
+ rvi = vlapic_has_pending_irq(v);
+ if ( rvi != -1 )
+ status |= rvi & 0xff;
+
+ __vmwrite(GUEST_INTR_STATUS, status);
+ }
+}
+
static void virtual_vmexit(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
@@ -1340,6 +1370,9 @@ static void virtual_vmexit(struct cpu_us
/* updating host cr0 to sync TS bit */
__vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);

+ if ( cpu_has_vmx_virtual_intr_delivery )
+ nvmx_update_apicv(v);
+
vmreturn(regs, VMSUCCEED);
}

--- a/xen/include/asm-x86/hvm/vlapic.h
+++ b/xen/include/asm-x86/hvm/vlapic.h
@@ -98,7 +98,7 @@ bool_t is_vlapic_lvtpc_enabled(struct vl
void vlapic_set_irq(struct vlapic *vlapic, uint8_t vec, uint8_t trig);

int vlapic_has_pending_irq(struct vcpu *v);
-int vlapic_ack_pending_irq(struct vcpu *v, int vector);
+int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool_t force_ack);

int vlapic_init(struct vcpu *v);
void vlapic_destroy(struct vcpu *v);
@@ -110,6 +110,7 @@ void vlapic_tdt_msr_set(struct vlapic *v
uint64_t vlapic_tdt_msr_get(struct vlapic *vlapic);

int vlapic_accept_pic_intr(struct vcpu *v);
+uint32_t vlapic_set_ppr(struct vlapic *vlapic);

void vlapic_adjust_i8259_target(struct domain *d);

--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -448,7 +448,7 @@ static inline int __vmxon(u64 addr)

void vmx_get_segment_register(struct vcpu *, enum x86_segment,
struct segment_register *);
-void vmx_inject_extint(int trap);
+void vmx_inject_extint(int trap, uint8_t source);
void vmx_inject_nmi(void);

int ept_p2m_init(struct p2m_domain *p2m);
--- a/xen/include/asm-x86/hvm/vmx/vvmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vvmx.h
@@ -36,6 +36,7 @@ struct nestedvmx {
struct {
unsigned long intr_info;
u32 error_code;
+ u8 source;
} intr;
struct {
bool_t enabled;
@ -1,24 +0,0 @@
References: bnc#835896

# Commit 69962e19ed432570f6cdcfdb5f6f22d6e3c54e6c
# Date 2013-08-22 11:24:00 +0200
# Author Juergen Gross <juergen.gross@ts.fujitsu.com>
# Committer Jan Beulich <jbeulich@suse.com>
Correct X2-APIC HVM emulation

commit 6859874b61d5ddaf5289e72ed2b2157739b72ca5 ("x86/HVM: fix x2APIC
APIC_ID read emulation") introduced an error for the hvm emulation of
x2apic. Any attempt to write to the APIC_ICR MSR will result in a GP
fault.

Signed-off-by: Juergen Gross <juergen.gross@ts.fujitsu.com>

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -868,6 +868,7 @@ int hvm_x2apic_msr_write(struct vcpu *v,
rc = vlapic_reg_write(v, APIC_ICR2, (uint32_t)(msr_content >> 32));
if ( rc )
return rc;
+ break;

case APIC_ICR2:
return X86EMUL_UNHANDLEABLE;
@ -1,24 +0,0 @@
# Commit 850188e1278cecd1dfb9b936024bee2d8dfdcc18
# Date 2013-08-27 11:11:38 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: don't allow Dom0 access to the MSI address range

In particular, MMIO assignments should not be done using this area.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1122,6 +1122,10 @@ int __init construct_dom0(
if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
rc |= iomem_deny_access(dom0, mfn, mfn);
}
+ /* MSI range. */
+ rc |= iomem_deny_access(dom0, paddr_to_pfn(MSI_ADDR_BASE_LO),
+ paddr_to_pfn(MSI_ADDR_BASE_LO +
+ MSI_ADDR_DEST_ID_MASK));

/* Remove access to E820_UNUSABLE I/O regions above 1MB. */
for ( i = 0; i < e820.nr_map; i++ )
@ -1,52 +0,0 @@
# Commit 3e787021fb2420851c7bdc3911ea53c728ba5ac0
# Date 2013-08-27 11:15:15 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/Intel: add support for Haswell CPU models

... according to their most recent public documentation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -135,8 +135,10 @@ static void do_get_hw_residencies(void *
case 0x3A:
case 0x3E:
/* Haswell */
- case 0x3c:
+ case 0x3C:
+ case 0x3F:
case 0x45:
+ case 0x46:
GET_PC2_RES(hw_res->pc2);
GET_CC7_RES(hw_res->cc7);
/* fall through */
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1814,7 +1814,7 @@ static const struct lbr_info *last_branc
/* Ivy Bridge */
case 58: case 62:
/* Haswell */
- case 60: case 69:
+ case 60: case 63: case 69: case 70:
return nh_lbr;
break;
/* Atom */
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -878,7 +878,12 @@ int vmx_vpmu_initialise(struct vcpu *v,

case 0x3a: /* IvyBridge */
case 0x3e: /* IvyBridge EP */
- case 0x3c: /* Haswell */
+
+ /* Haswell: */
+ case 0x3c:
+ case 0x3f:
+ case 0x45:
+ case 0x46:
ret = core2_vpmu_initialise(v, vpmu_flags);
if ( !ret )
vpmu->arch_vpmu_ops = &core2_vpmu_ops;
@ -1,42 +0,0 @@
# Commit 9e2c5938246546a5b3f698b7421640d85602b994
# Date 2013-08-28 10:18:39 +0200
# Author Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
Fix inactive timer list corruption on second S3 resume

init_timer cannot be safely called multiple times on the same timer
since it does memset(0) on the structure, erasing the auxiliary member
used by the linked list code. This breaks the inactive timer list in
common/timer.c.

Moved resume_timer initialisation to ns16550_init_postirq, so it's only
done once.

Signed-off-by: Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/drivers/char/ns16550.c
+++ b/xen/drivers/char/ns16550.c
@@ -128,6 +128,8 @@ static struct ns16550 {
#define RESUME_DELAY MILLISECS(10)
#define RESUME_RETRIES 100

+static void ns16550_delayed_resume(void *data);
+
static char ns_read_reg(struct ns16550 *uart, int reg)
{
if ( uart->remapped_io_base == NULL )
@@ -323,6 +325,7 @@ static void __init ns16550_init_postirq(
serial_async_transmit(port);

init_timer(&uart->timer, ns16550_poll, port, 0);
+ init_timer(&uart->resume_timer, ns16550_delayed_resume, port, 0);

/* Calculate time to fill RX FIFO and/or empty TX FIFO for polling. */
bits = uart->data_bits + uart->stop_bits + !!uart->parity;
@@ -413,7 +416,6 @@ static void ns16550_resume(struct serial
if ( ns16550_ioport_invalid(uart) )
{
delayed_resume_tries = RESUME_RETRIES;
- init_timer(&uart->resume_timer, ns16550_delayed_resume, port, 0);
set_timer(&uart->resume_timer, NOW() + RESUME_DELAY);
}
else
@ -1,254 +0,0 @@
# Commit 062919448e2f4b127c9c3c085b1a8e1d56a33051
# Date 2013-08-28 17:03:50 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: AVX instruction emulation fixes

- we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
one as the second prefix byte
- early decoding normalized vex.reg, thus corrupting it for the main
consumer (copy_REX_VEX()), resulting in #UD on the two-operand
instructions we emulate

Also add respective test cases to the testing utility plus
- fix get_fpu() (the fall-through order was inverted)
- add cpu_has_avx2, even if it's currently unused (as in the new test
cases I decided to refrain from using AVX2 instructions in order to
be able to actually run all the tests on the hardware I have)
- slightly tweak cpu_has_avx to more consistently express the outputs
we don't care about (sinking them all into the same variable)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x
}

#define cpu_has_avx ({ \
- unsigned int eax = 1, ecx = 0, edx; \
- cpuid(&eax, &edx, &ecx, &edx, NULL); \
+ unsigned int eax = 1, ecx = 0; \
+ cpuid(&eax, &eax, &ecx, &eax, NULL); \
if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
ecx = 0; \
(ecx & (1U << 28)) != 0; \
})

+#define cpu_has_avx2 ({ \
+ unsigned int eax = 1, ebx, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+ ebx = 0; \
+ else { \
+ eax = 7, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ } \
+ (ebx & (1U << 5)) != 0; \
+})
+
int get_fpu(
void (*exception_callback)(void *, struct cpu_user_regs *),
void *exception_callback_arg,
@@ -111,14 +123,14 @@ int get_fpu(
{
case X86EMUL_FPU_fpu:
break;
- case X86EMUL_FPU_ymm:
- if ( cpu_has_avx )
+ case X86EMUL_FPU_mmx:
+ if ( cpu_has_mmx )
break;
case X86EMUL_FPU_xmm:
if ( cpu_has_sse )
break;
- case X86EMUL_FPU_mmx:
- if ( cpu_has_mmx )
+ case X86EMUL_FPU_ymm:
+ if ( cpu_has_avx )
break;
default:
return X86EMUL_UNHANDLEABLE;
@@ -629,6 +641,73 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_to_mem[];
+
+ asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n"
+ ".popsection" :: "c" (NULL) );
+
+ memcpy(instr, vmovdqu_to_mem, 15);
+ memset(res, 0x55, 128);
+ memset(res + 16, 0xff, 16);
+ memset(res + 20, 0x00, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovdqu (%edx),%ymm4...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_from_mem[];
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
+#else
+ asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
+ "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
+#endif
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovdqu_from_mem, 15);
+ memset(res + 4, 0xff, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( rc != 0xffffffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
memset(res, 0x77, 64);
memset(res + 10, 0x66, 8);
@@ -683,6 +762,59 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing vmovsd %xmm5,(%ecx)...");
+ memset(res, 0x88, 64);
+ memset(res + 10, 0x77, 8);
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovsd_to_mem[];
+
+ asm volatile ( "vbroadcastsd %0, %%ymm5\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n"
+ ".popsection" :: "m" (res[10]), "c" (NULL) );
+
+ memcpy(instr, vmovsd_to_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)(res + 2);
+ regs.edx = 0;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ {
+ printf("skipped\n");
+ memset(res + 2, 0x77, 8);
+ }
+
+ printf("%-40s", "Testing vmovaps (%edx),%ymm7...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovaps_from_mem[];
+
+ asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovaps_from_mem: vmovaps (%0), %%ymm7\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovaps_from_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+ asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t"
+ "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) );
+ if ( rc != 0xff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
for ( j = 1; j <= 2; j++ )
{
#if defined(__i386__)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1454,10 +1454,10 @@ x86_emulate(
/* VEX */
generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);

- vex.raw[0] = b;
+ vex.raw[0] = modrm;
if ( b & 1 )
{
- vex.raw[1] = b;
+ vex.raw[1] = modrm;
vex.opcx = vex_0f;
vex.x = 1;
vex.b = 1;
@@ -1479,10 +1479,7 @@ x86_emulate(
}
}
}
- vex.reg ^= 0xf;
- if ( !mode_64bit() )
- vex.reg &= 0x7;
- else if ( !vex.r )
+ if ( mode_64bit() && !vex.r )
rex_prefix |= REX_R;

fail_if(vex.opcx != vex_0f);
@@ -3899,8 +3896,9 @@ x86_emulate(
else
{
fail_if((vex.opcx != vex_0f) ||
- (vex.reg && ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+ ((vex.reg != 0xf) &&
+ ((ea.type == OP_MEM) ||
+ !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
ea.bytes = 16 << vex.l;
@@ -4168,7 +4166,7 @@ x86_emulate(
}
else
{
- fail_if((vex.opcx != vex_0f) || vex.reg ||
+ fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
@ -1,29 +0,0 @@
# Commit 3785d30efe8264b899499e0883b10cc434bd0959
# Date 2013-08-29 09:31:37 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
AMD IOMMU: add missing check

We shouldn't accept IVHD tables specifying IO-APIC IDs beyond the limit
we support (MAX_IO_APICS, currently 128).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Suravee Suthikulpanit <suravee.suthikulapanit@amd.com>

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -674,6 +674,13 @@ static u16 __init parse_ivhd_device_spec
if ( IO_APIC_ID(apic) != special->handle )
continue;

+ if ( special->handle >= ARRAY_SIZE(ioapic_sbdf) )
+ {
+ printk(XENLOG_ERR "IVHD Error: IO-APIC %#x entry beyond bounds\n",
+ special->handle);
+ return 0;
+ }
+
if ( ioapic_sbdf[special->handle].pin_2_idx )
{
if ( ioapic_sbdf[special->handle].bdf == bdf &&
@ -1,28 +0,0 @@
# Commit 4aa19549e17650b9bfe2b31d7f52a95696d388f0
# Date 2013-08-30 10:40:29 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
hvmloader/smbios: Correctly count the number of tables written

Fixes a regression indirectly introduced by c/s 4d23036e709627

That changeset added some smbios tables which were optional, based on
the toolstack providing appropriate xenstore keys. The do_struct()
macro would unconditionally increment nr_structs, even if a table was
not actually written.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/tools/firmware/hvmloader/smbios.c
+++ b/tools/firmware/hvmloader/smbios.c
@@ -192,7 +192,8 @@ write_smbios_tables(void *ep, void *star

#define do_struct(fn) do { \
q = (fn); \
- (*nr_structs)++; \
+ if ( q != p ) \
+ (*nr_structs)++; \
if ( (q - p) > *max_struct_size ) \
*max_struct_size = q - p; \
p = q; \
@ -1,42 +0,0 @@
# Commit 0f4cb23c3ea5b987c49c9a9368e7a0d505ec064f
# Date 2013-08-30 10:40:48 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
public/hvm_xs_strings.h: Fix ABI regression for OEM SMBios strings

The old code for OEM SMBios strings was:

char path[20] = "bios-strings/oem-XX";
path[(sizeof path) - 3] = '0' + ((i < 10) ? i : i / 10);
path[(sizeof path) - 2] = (i < 10) ? '\0' : '0' + (i % 10);

Where oem-1 thru 9 specifically had no leading 0.

However, the definition of HVM_XS_OEM_STRINGS specifically requires leading
0s.

This regression was introduced by the combination of c/s 4d23036e709627 and
e64c3f71ceb662.

I realise that this patch causes a change to the public headers. However, I
feel it is justified as:

* All toolstacks used to have to embed the magic string (and almost certainly
still do)
* If by some miracle a new toolstack has started using the new define, it
will continue to work.
* The only in-tree consumer of the define is hvmloader itself.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/include/public/hvm/hvm_xs_strings.h
+++ b/xen/include/public/hvm/hvm_xs_strings.h
@@ -75,6 +75,6 @@
/* 1 to 99 OEM strings can be set in xenstore using values of the form
* below. These strings will be loaded into the SMBIOS type 11 structure.
*/
-#define HVM_XS_OEM_STRINGS "bios-strings/oem-%02d"
+#define HVM_XS_OEM_STRINGS "bios-strings/oem-%d"

#endif /* __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ */
@ -1,105 +0,0 @@
References: bnc#833796

# Commit c6066e78f4a66005b0d5d86c6ade32e2ab78923a
# Date 2013-08-30 10:56:07 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/xsave: initialization improvements

- properly validate available feature set on APs
- also validate xsaveopt availability on APs
- properly indicate whether the initialization is on the BSP (we
shouldn't be using "cpu == 0" checks for this)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -304,7 +304,7 @@ void __cpuinit identify_cpu(struct cpuin
clear_bit(X86_FEATURE_XSAVE, boot_cpu_data.x86_capability);

if ( cpu_has_xsave )
- xstate_init();
+ xstate_init(c == &boot_cpu_data);

/*
* The vendor-specific functions might have changed features. Now
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -247,11 +247,10 @@ void xstate_free_save_area(struct vcpu *
}

/* Collect the information of processor's extended state */
-void xstate_init(void)
+void xstate_init(bool_t bsp)
{
- u32 eax, ebx, ecx, edx;
- int cpu = smp_processor_id();
- u32 min_size;
+ u32 eax, ebx, ecx, edx, min_size;
+ u64 feature_mask;

if ( boot_cpu_data.cpuid_level < XSTATE_CPUID )
return;
@@ -260,6 +259,7 @@ void xstate_init(void)

BUG_ON((eax & XSTATE_FP_SSE) != XSTATE_FP_SSE);
BUG_ON((eax & XSTATE_YMM) && !(eax & XSTATE_SSE));
+ feature_mask = (((u64)edx << 32) | eax) & XCNTXT_MASK;

/* FP/SSE, XSAVE.HEADER, YMM */
min_size = XSTATE_AREA_MIN_SIZE;
@@ -271,31 +271,33 @@ void xstate_init(void)
* Set CR4_OSXSAVE and run "cpuid" to get xsave_cntxt_size.
*/
set_in_cr4(X86_CR4_OSXSAVE);
- if ( !set_xcr0((((u64)edx << 32) | eax) & XCNTXT_MASK) )
+ if ( !set_xcr0(feature_mask) )
BUG();
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);

- if ( cpu == 0 )
+ if ( bsp )
{
+ xfeature_mask = feature_mask;
/*
* xsave_cntxt_size is the max size required by enabled features.
* We know FP/SSE and YMM about eax, and nothing about edx at present.
*/
xsave_cntxt_size = ebx;
- xfeature_mask = eax + ((u64)edx << 32);
- xfeature_mask &= XCNTXT_MASK;
printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
__func__, xsave_cntxt_size, xfeature_mask);
-
- /* Check XSAVEOPT feature. */
- cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
- cpu_has_xsaveopt = !!(eax & XSTATE_FEATURE_XSAVEOPT);
}
else
{
+ BUG_ON(xfeature_mask != feature_mask);
BUG_ON(xsave_cntxt_size != ebx);
- BUG_ON(xfeature_mask != (xfeature_mask & XCNTXT_MASK));
}
+
+ /* Check XSAVEOPT feature. */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ if ( bsp )
+ cpu_has_xsaveopt = !!(eax & XSTATE_FEATURE_XSAVEOPT);
+ else
+ BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT));
}

int handle_xsetbv(u32 index, u64 new_bv)
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -81,6 +81,6 @@ int __must_check handle_xsetbv(u32 index
/* extended state init and cleanup functions */
void xstate_free_save_area(struct vcpu *v);
int xstate_alloc_save_area(struct vcpu *v);
-void xstate_init(void);
+void xstate_init(bool_t bsp);

#endif /* __ASM_XSTATE_H */
@ -1,31 +0,0 @@
# Commit 1893cf77992cc0ce9d827a8d345437fa2494b540
# Date 2013-09-03 16:36:47 +0100
# Author Steven Noonan <snoonan@amazon.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
xend: handle extended PCI configuration space when saving state

Newer PCI standards (e.g., PCI-X 2.0 and PCIe) introduce extended
configuration space which is larger than 256 bytes. This patch uses
stat() to determine the amount of space used to correctly save all of
the PCI configuration space. Resets handled by the xen-pciback driver
don't have this problem, as that code correctly handles saving
extended configuration space.

Signed-off-by: Steven Noonan <snoonan@amazon.com>
Reviewed-by: Matt Wilson <msw@amazon.com>
[msw: adjusted commit message]
Signed-off-by: Matt Wilson <msw@amazon.com>

--- a/tools/python/xen/util/pci.py
+++ b/tools/python/xen/util/pci.py
@@ -521,8 +521,9 @@ def save_pci_conf_space(devs_string):
pci_path = sysfs_mnt + SYSFS_PCI_DEVS_PATH + '/' + pci_str + \
SYSFS_PCI_DEV_CONFIG_PATH
fd = os.open(pci_path, os.O_RDONLY)
+ size = os.fstat(fd).st_size
configs = []
- for i in range(0, 256, 4):
+ for i in range(0, size, 4):
configs = configs + [os.read(fd,4)]
os.close(fd)
pci_list = pci_list + [pci_path]
@ -1,48 +0,0 @@
# Commit 749019afca4fd002d36856bad002cc11f7d0ddda
# Date 2013-09-03 16:36:52 +0100
# Author Xi Xiong <xixiong@amazon.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
xend: fix file descriptor leak in pci utilities

A file descriptor leak was detected after creating multiple domUs with
pass-through PCI devices. This patch fixes the issue.

Signed-off-by: Xi Xiong <xixiong@amazon.com>
Reviewed-by: Matt Wilson <msw@amazon.com>
[msw: adjusted commit message]
Signed-off-by: Matt Wilson <msw@amazon.com>

--- a/tools/python/xen/util/pci.py
+++ b/tools/python/xen/util/pci.py
@@ -969,18 +969,22 @@ class PciDevice:
ttl = 480; # 3840 bytes, minimum 8 bytes per capability
pos = 0x100

+ fd = None
try:
fd = os.open(path, os.O_RDONLY)
os.lseek(fd, pos, 0)
h = os.read(fd, 4)
if len(h) == 0: # MMCONF is not enabled?
+ os.close(fd)
return 0
header = struct.unpack('I', h)[0]
if header == 0 or header == -1:
+ os.close(fd)
return 0

while ttl > 0:
if (header & 0x0000ffff) == cap:
+ os.close(fd)
return pos
pos = (header >> 20) & 0xffc
if pos < 0x100:
@@ -990,6 +994,8 @@ class PciDevice:
ttl = ttl - 1
os.close(fd)
except OSError, (errno, strerr):
+ if fd is not None:
+ os.close(fd)
raise PciDeviceParseError(('Error when accessing sysfs: %s (%d)' %
(strerr, errno)))
return 0
@ -1,84 +0,0 @@
# Commit 5f2875739beef3a75c7a7e8579b6cbcb464e61b3
# Date 2013-09-05 11:47:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
hvmloader: fix SeaBIOS interface

The SeaBIOS ROM image may validly exceed 128k in size; it's only our
interface code that so far assumed that it wouldn't. Remove that
restriction by setting the base address depending on image size.

Add a check to HVM loader so that too big images won't result in silent
guest failure anymore.

Uncomment the intended build-time size check for rombios, moving it
into a function so that it would actually compile.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/tools/firmware/hvmloader/config-seabios.h
+++ b/tools/firmware/hvmloader/config-seabios.h
@@ -3,8 +3,6 @@

#define BIOS_INFO_PHYSICAL_ADDRESS 0x00001000

-#define SEABIOS_PHYSICAL_ADDRESS 0x000E0000
-
#endif /* __HVMLOADER_CONFIG_SEABIOS_H__ */

/*
--- a/tools/firmware/hvmloader/hvmloader.c
+++ b/tools/firmware/hvmloader/hvmloader.c
@@ -292,8 +292,12 @@ int main(void)
if ( bios->bios_load )
bios->bios_load(bios);
else
+ {
+ BUG_ON(bios->bios_address + bios->image_size >
+ HVMLOADER_PHYSICAL_ADDRESS);
memcpy((void *)bios->bios_address, bios->image,
bios->image_size);
+ }

if ( (hvm_info->nr_vcpus > 1) || hvm_info->apic_mode )
{
--- a/tools/firmware/hvmloader/rombios.c
+++ b/tools/firmware/hvmloader/rombios.c
@@ -127,6 +127,8 @@ static void rombios_load(const struct bi
uint32_t bioshigh;
struct rombios_info *info;

+ BUILD_BUG_ON(sizeof(rombios) > 0x100000 - ROMBIOS_PHYSICAL_ADDRESS);
+
memcpy((void *)config->bios_address, config->image,
config->image_size);

@@ -206,8 +208,6 @@ static void rombios_create_smbios_tables
SMBIOS_PHYSICAL_END);
}

-//BUILD_BUG_ON(sizeof(rombios) > (0x00100000U - ROMBIOS_PHYSICAL_ADDRESS));
-
struct bios_config rombios_config = {
.name = "ROMBIOS",

--- a/tools/firmware/hvmloader/seabios.c
+++ b/tools/firmware/hvmloader/seabios.c
@@ -133,15 +133,13 @@ static void seabios_setup_e820(void)
dump_e820_table(e820, info->e820_nr);
}

-//BUILD_BUG_ON(sizeof(seabios) > (0x00100000U - SEABIOS_PHYSICAL_ADDRESS));
-
struct bios_config seabios_config = {
.name = "SeaBIOS",

.image = seabios,
.image_size = sizeof(seabios),

- .bios_address = SEABIOS_PHYSICAL_ADDRESS,
+ .bios_address = 0x100000 - sizeof(seabios),

.load_roms = NULL,

@ -1,195 +0,0 @@
References: bnc#833251, bnc#834751

# Commit a350f3f43bcfac9c1591e28d8e43c505fcb172a5
# Date 2013-09-09 10:40:11 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: properly handle run time memory regions outside the 1:1 map

Namely with PFN compression, MMIO ranges that the firmware may need
runtime access to can live in the holes that get shrunk/eliminated by
PFN compression, and hence no mappings would result from simply
copying Xen's direct mapping table's L3 page table entries. Build
mappings for this "manually" in the EFI runtime call 1:1 page tables.

Use the opportunity to also properly identify (via a forcibly undefined
manifest constant) all the disabled code regions associated with it not
being acceptable for us to call SetVirtualAddressMap().

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -26,6 +26,9 @@
#include <asm/msr.h>
#include <asm/processor.h>

+/* Using SetVirtualAddressMap() is incompatible with kexec: */
+#undef USE_SET_VIRTUAL_ADDRESS_MAP
+
#define SHIM_LOCK_PROTOCOL_GUID \
{ 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} }

@@ -1434,7 +1437,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY

/* Adjust pointers into EFI. */
efi_ct = (void *)efi_ct + DIRECTMAP_VIRT_START;
-#if 0 /* Only needed when using virtual mode (see efi_init_memory()). */
+#ifdef USE_SET_VIRTUAL_ADDRESS_MAP
efi_rs = (void *)efi_rs + DIRECTMAP_VIRT_START;
#endif
efi_memmap = (void *)efi_memmap + DIRECTMAP_VIRT_START;
@@ -1477,6 +1480,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
for( ; ; ); /* not reached */
}

+#ifndef USE_SET_VIRTUAL_ADDRESS_MAP
static __init void copy_mapping(unsigned long mfn, unsigned long end,
bool_t (*is_valid)(unsigned long smfn,
unsigned long emfn))
@@ -1520,6 +1524,7 @@ static bool_t __init rt_range_valid(unsi
{
return 1;
}
+#endif

#define INVALID_VIRTUAL_ADDRESS (0xBAAADUL << \
(EFI_PAGE_SHIFT + BITS_PER_LONG - 32))
@@ -1527,6 +1532,13 @@ static bool_t __init rt_range_valid(unsi
void __init efi_init_memory(void)
{
unsigned int i;
+#ifndef USE_SET_VIRTUAL_ADDRESS_MAP
+ struct rt_extra {
+ struct rt_extra *next;
+ unsigned long smfn, emfn;
+ unsigned int prot;
+ } *extra, *extra_head = NULL;
+#endif

printk(XENLOG_INFO "EFI memory map:\n");
for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size )
@@ -1573,6 +1585,8 @@ void __init efi_init_memory(void)
!(smfn & pfn_hole_mask) &&
!((smfn ^ (emfn - 1)) & ~pfn_pdx_bottom_mask) )
{
+ if ( (unsigned long)mfn_to_virt(emfn - 1) >= HYPERVISOR_VIRT_END )
+ prot &= ~_PAGE_GLOBAL;
if ( map_pages_to_xen((unsigned long)mfn_to_virt(smfn),
smfn, emfn - smfn, prot) == 0 )
desc->VirtualStart =
@@ -1581,15 +1595,29 @@ void __init efi_init_memory(void)
printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n",
smfn, emfn - 1);
}
+#ifndef USE_SET_VIRTUAL_ADDRESS_MAP
+ else if ( !((desc->PhysicalStart + len - 1) >> (VADDR_BITS - 1)) &&
+ (extra = xmalloc(struct rt_extra)) != NULL )
+ {
+ extra->smfn = smfn;
+ extra->emfn = emfn;
+ extra->prot = prot & ~_PAGE_GLOBAL;
+ extra->next = extra_head;
+ extra_head = extra;
+ desc->VirtualStart = desc->PhysicalStart;
+ }
+#endif
else
{
+#ifdef USE_SET_VIRTUAL_ADDRESS_MAP
/* XXX allocate e.g. down from FIXADDR_START */
+#endif
printk(XENLOG_ERR "No mapping for MFNs %#lx-%#lx\n",
smfn, emfn - 1);
}
}

-#if 0 /* Incompatible with kexec. */
+#ifdef USE_SET_VIRTUAL_ADDRESS_MAP
efi_rs->SetVirtualAddressMap(efi_memmap_size, efi_mdesc_size,
mdesc_ver, efi_memmap);
#else
@@ -1600,20 +1628,74 @@ void __init efi_init_memory(void)

copy_mapping(0, max_page, ram_range_valid);

- /* Insert non-RAM runtime mappings. */
+ /* Insert non-RAM runtime mappings inside the direct map. */
for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size )
{
const EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;

- if ( desc->Attribute & EFI_MEMORY_RUNTIME )
+ if ( (desc->Attribute & EFI_MEMORY_RUNTIME) &&
+ desc->VirtualStart != INVALID_VIRTUAL_ADDRESS &&
+ desc->VirtualStart != desc->PhysicalStart )
+ copy_mapping(PFN_DOWN(desc->PhysicalStart),
+ PFN_UP(desc->PhysicalStart +
+ (desc->NumberOfPages << EFI_PAGE_SHIFT)),
+ rt_range_valid);
+ }
+
+ /* Insert non-RAM runtime mappings outside of the direct map. */
+ while ( (extra = extra_head) != NULL )
+ {
+ unsigned long addr = extra->smfn << PAGE_SHIFT;
+ l4_pgentry_t l4e = efi_l4_pgtable[l4_table_offset(addr)];
+ l3_pgentry_t *pl3e;
+ l2_pgentry_t *pl2e;
+ l1_pgentry_t *l1t;
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
{
- if ( desc->VirtualStart != INVALID_VIRTUAL_ADDRESS )
- copy_mapping(PFN_DOWN(desc->PhysicalStart),
- PFN_UP(desc->PhysicalStart +
- (desc->NumberOfPages << EFI_PAGE_SHIFT)),
- rt_range_valid);
- else
- /* XXX */;
+ pl3e = alloc_xen_pagetable();
+ BUG_ON(!pl3e);
+ clear_page(pl3e);
+ efi_l4_pgtable[l4_table_offset(addr)] =
+ l4e_from_paddr(virt_to_maddr(pl3e), __PAGE_HYPERVISOR);
+ }
+ else
+ pl3e = l4e_to_l3e(l4e);
+ pl3e += l3_table_offset(addr);
+ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+ {
+ pl2e = alloc_xen_pagetable();
+ BUG_ON(!pl2e);
+ clear_page(pl2e);
+ *pl3e = l3e_from_paddr(virt_to_maddr(pl2e), __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
+ pl2e = l3e_to_l2e(*pl3e);
+ }
+ pl2e += l2_table_offset(addr);
+ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
+ {
+ l1t = alloc_xen_pagetable();
+ BUG_ON(!l1t);
+ clear_page(l1t);
+ *pl2e = l2e_from_paddr(virt_to_maddr(l1t), __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE);
+ l1t = l2e_to_l1e(*pl2e);
+ }
+ for ( i = l1_table_offset(addr);
+ i < L1_PAGETABLE_ENTRIES && extra->smfn < extra->emfn;
+ ++i, ++extra->smfn )
+ l1t[i] = l1e_from_pfn(extra->smfn, extra->prot);
+
+ if ( extra->smfn == extra->emfn )
+ {
+ extra_head = extra->next;
+ xfree(extra);
}
}

@ -1,145 +0,0 @@
|
||||
# Commit a35137373aa9042424565e5ee76dc0a3bb7642ae
# Date 2013-09-09 10:43:11 +0200
# Author Joby Poriyath <joby.poriyath@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: allow guest to set/clear MSI-X mask bit (try 2)

Guest needs the ability to enable and disable MSI-X interrupts
by setting the MSI-X control bit, for a passed-through device.
Guest is allowed to write the MSI-X mask bit only if Xen *thinks*
that mask is clear (interrupts enabled). If the mask is set by
Xen (interrupts disabled), writes to the mask bit by the guest are
ignored.

Currently, a write to the MSI-X mask bit by the guest is silently
ignored.

A likely scenario is where we have an 82599 SR-IOV nic passed
through to a guest. From the guest if you do

ifconfig <ETH_DEV> down
ifconfig <ETH_DEV> up

the interrupts remain masked. On VF reset, the mask bit is set
by the controller. At this point, Xen is not aware that mask is set.
However, interrupts are enabled by the VF driver by clearing the mask
bit by writing directly to the BAR3 region containing the MSI-X table.

From dom0, we can verify that interrupts are being masked using
'xl debug-keys M'.

Initially, the guest was allowed to modify the MSI-X mask bit.
Later this behaviour was changed.
See changeset 74c213c506afcd74a8556dd092995fd4dc38b225.

Signed-off-by: Joby Poriyath <joby.poriyath@citrix.com>

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -187,6 +187,19 @@ static struct msixtbl_entry *msixtbl_fin
return NULL;
}

+static struct msi_desc *virt_to_msi_desc(struct pci_dev *dev, void *virt)
+{
+ struct msi_desc *desc;
+
+ list_for_each_entry( desc, &dev->msi_list, list )
+ if ( desc->msi_attrib.type == PCI_CAP_ID_MSIX &&
+ virt >= desc->mask_base &&
+ virt < desc->mask_base + PCI_MSIX_ENTRY_SIZE )
+ return desc;
+
+ return NULL;
+}
+
static void __iomem *msixtbl_addr_to_virt(
struct msixtbl_entry *entry, unsigned long addr)
{
@@ -247,13 +260,16 @@ out:
}

static int msixtbl_write(struct vcpu *v, unsigned long address,
- unsigned long len, unsigned long val)
+ unsigned long len, unsigned long val)
{
unsigned long offset;
struct msixtbl_entry *entry;
+ const struct msi_desc *msi_desc;
void *virt;
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
+ unsigned long flags, orig;
+ struct irq_desc *desc;

if ( len != 4 || (address & 3) )
return r;
@@ -283,22 +299,57 @@ static int msixtbl_write(struct vcpu *v,
if ( !virt )
goto out;

- /* Do not allow the mask bit to be changed. */
-#if 0 /* XXX
- * As the mask bit is the only defined bit in the word, and as the
- * host MSI-X code doesn't preserve the other bits anyway, doing
- * this is pointless. So for now just discard the write (also
- * saving us from having to determine the matching irq_desc).
- */
+ msi_desc = virt_to_msi_desc(entry->pdev, virt);
+ if ( !msi_desc || msi_desc->irq < 0 )
+ goto out;
+
+ desc = irq_to_desc(msi_desc->irq);
+ if ( !desc )
+ goto out;
+
spin_lock_irqsave(&desc->lock, flags);
+
+ if ( !desc->msi_desc )
+ goto unlock;
+
+ ASSERT(msi_desc == desc->msi_desc);
+
orig = readl(virt);
- val &= ~PCI_MSIX_VECTOR_BITMASK;
- val |= orig & PCI_MSIX_VECTOR_BITMASK;
+
+ /*
+ * Do not allow guest to modify MSI-X control bit if it is masked
+ * by Xen. We'll only handle the case where Xen thinks that
+ * bit is unmasked, but hardware has silently masked the bit
+ * (in case of SR-IOV VF reset, etc). On the other hand, if Xen
+ * thinks that the bit is masked, but it's really not,
+ * we log a warning.
+ */
+ if ( msi_desc->msi_attrib.masked )
+ {
+ if ( !(orig & PCI_MSIX_VECTOR_BITMASK) )
+ printk(XENLOG_WARNING "MSI-X control bit is unmasked when"
+ " it is expected to be masked [%04x:%02x:%02x.%u]\n",
+ entry->pdev->seg, entry->pdev->bus,
+ PCI_SLOT(entry->pdev->devfn),
+ PCI_FUNC(entry->pdev->devfn));
+
+ goto unlock;
+ }
+
+ /*
+ * The mask bit is the only defined bit in the word. But we
+ * ought to preserve the reserved bits. Clearing the reserved
+ * bits can result in undefined behaviour (see PCI Local Bus
+ * Specification revision 2.3).
+ */
+ val &= PCI_MSIX_VECTOR_BITMASK;
+ val |= (orig & ~PCI_MSIX_VECTOR_BITMASK);
writel(val, virt);
- spin_unlock_irqrestore(&desc->lock, flags);
-#endif

+unlock:
+ spin_unlock_irqrestore(&desc->lock, flags);
r = X86EMUL_OKAY;
+
out:
rcu_read_unlock(&msixtbl_rcu_lock);
return r;
@ -1,27 +0,0 @@
# Commit 0fbf3208d9c1a568aeeb61d9f4fbca03b1cfa1f8
# Date 2013-09-09 14:34:12 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
xmalloc: make whole pages xfree() clear the order field (ab)used by xmalloc()

Not doing this was found to cause problems with sequences of allocation
(multi-page), freeing, and then again allocation of the same page upon
boot when interrupts are still disabled (causing the owner field to be
non-zero, thus making the allocator attempt a TLB flush and, in its
processing, triggering an assertion).

Reported-by: Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/xmalloc_tlsf.c
+++ b/xen/common/xmalloc_tlsf.c
@@ -629,6 +629,7 @@ void xfree(void *p)
unsigned int i, order = get_order_from_pages(size);

BUG_ON((unsigned long)p & ((PAGE_SIZE << order) - 1));
+ PFN_ORDER(virt_to_page(p)) = 0;
for ( i = 0; ; ++i )
{
if ( !(size & (1 << i)) )
@ -1,629 +0,0 @@
References: bnc#833796

# Commit 4cc1344447a0458df5d222960f2adf1b65084fa8
# Date 2013-09-09 14:36:54 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/xsave: fix migration from xsave-capable to xsave-incapable host

With CPUID features suitably masked this is supposed to work, but was
completely broken (i.e. the case wasn't even considered when the
original xsave save/restore code was written).

First of all, xsave_enabled() wrongly returned the value of
cpu_has_xsave, i.e. not even taking into consideration attributes of
the vCPU in question. Instead this function ought to check whether the
guest ever enabled xsave support (by writing a [non-zero] value to
XCR0). As a result of this, a vCPU's xcr0 and xcr0_accum must no longer
be initialized to XSTATE_FP_SSE (since that's a valid value a guest
could write to XCR0), and the xsave/xrstor as well as the context
switch code need to suitably account for this (by always enforcing at
least this part of the state to be saved/loaded).

This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add
strictly sanity check for XSAVE/XRSTOR") - we need to cleanly
distinguish between hardware capabilities and vCPU used features.

Next both HVM and PV save code needed tweaking to not always save the
full state supported by the underlying hardware, but just the parts
that the guest actually used. Similarly the restore code should bail
not just on state being restored that the hardware cannot handle, but
also on inconsistent save state (inconsistent XCR0 settings or size of
saved state not in line with XCR0).

And finally the PV extended context get/set code needs to use slightly
different logic than the HVM one, as here we can't just key off of
xsave_enabled() (i.e. avoid doing anything if a guest doesn't use
xsave) because the tools use this function to determine host
capabilities as well as read/write vCPU state. The set operation in
particular needs to be capable of cleanly dealing with input that
consists of only the xcr0 and xcr0_accum values (if they're both zero
then no further data is required).

While for things to work correctly both sides (saving _and_ restoring
host) need to run with the fixed code, afaict no breakage should occur
if either side isn't up to date (other than the breakage that this
patch attempts to fix).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s
hv_cr4_mask &= ~X86_CR4_DE;
if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) )
hv_cr4_mask &= ~X86_CR4_FSGSBASE;
- if ( xsave_enabled(v) )
+ if ( cpu_has_xsave )
hv_cr4_mask &= ~X86_CR4_OSXSAVE;

if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
@@ -1347,9 +1347,13 @@ static void __context_switch(void)
if ( !is_idle_vcpu(n) )
{
memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
- if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() &&
- !set_xcr0(n->arch.xcr0) )
- BUG();
+ if ( cpu_has_xsave )
+ {
+ u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
+
+ if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
+ BUG();
+ }
vcpu_restore_fpu_eager(n);
n->arch.ctxt_switch_to(n);
}
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1047,11 +1047,8 @@ long arch_do_domctl(
struct xen_domctl_vcpuextstate *evc;
struct vcpu *v;
uint32_t offset = 0;
- uint64_t _xfeature_mask = 0;
- uint64_t _xcr0, _xcr0_accum;
- void *receive_buf = NULL, *_xsave_area;

-#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size)
+#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0))

evc = &domctl->u.vcpuextstate;

@@ -1062,15 +1059,16 @@ long arch_do_domctl(

if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate )
{
+ unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum);
+
if ( !evc->size && !evc->xfeature_mask )
{
evc->xfeature_mask = xfeature_mask;
- evc->size = PV_XSAVE_SIZE;
+ evc->size = size;
ret = 0;
goto vcpuextstate_out;
}
- if ( evc->size != PV_XSAVE_SIZE ||
- evc->xfeature_mask != xfeature_mask )
+ if ( evc->size != size || evc->xfeature_mask != xfeature_mask )
{
ret = -EINVAL;
goto vcpuextstate_out;
@@ -1093,7 +1091,7 @@ long arch_do_domctl(
offset += sizeof(v->arch.xcr0_accum);
if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer,
offset, (void *)v->arch.xsave_area,
- xsave_cntxt_size) )
+ size - 2 * sizeof(uint64_t)) )
{
ret = -EFAULT;
goto vcpuextstate_out;
@@ -1101,13 +1099,14 @@ long arch_do_domctl(
}
else
{
- ret = -EINVAL;
+ void *receive_buf;
+ uint64_t _xcr0, _xcr0_accum;
+ const struct xsave_struct *_xsave_area;

- _xfeature_mask = evc->xfeature_mask;
- /* xsave context must be restored on compatible target CPUs */
- if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask )
- goto vcpuextstate_out;
- if ( evc->size > PV_XSAVE_SIZE || evc->size < 2 * sizeof(uint64_t) )
+ ret = -EINVAL;
+ if ( evc->size < 2 * sizeof(uint64_t) ||
+ evc->size > 2 * sizeof(uint64_t) +
+ xstate_ctxt_size(xfeature_mask) )
goto vcpuextstate_out;

receive_buf = xmalloc_bytes(evc->size);
@@ -1128,20 +1127,30 @@ long arch_do_domctl(
_xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t));
_xsave_area = receive_buf + 2 * sizeof(uint64_t);

- if ( !(_xcr0 & XSTATE_FP) || _xcr0 & ~xfeature_mask )
+ if ( _xcr0_accum )
{
- xfree(receive_buf);
- goto vcpuextstate_out;
+ if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE )
+ ret = validate_xstate(_xcr0, _xcr0_accum,
+ _xsave_area->xsave_hdr.xstate_bv,
+ evc->xfeature_mask);
}
- if ( (_xcr0 & _xcr0_accum) != _xcr0 )
+ else if ( !_xcr0 )
+ ret = 0;
+ if ( ret )
{
xfree(receive_buf);
goto vcpuextstate_out;
}

- v->arch.xcr0 = _xcr0;
- v->arch.xcr0_accum = _xcr0_accum;
- memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * sizeof(uint64_t) );
+ if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) )
+ {
+ v->arch.xcr0 = _xcr0;
+ v->arch.xcr0_accum = _xcr0_accum;
+ memcpy(v->arch.xsave_area, _xsave_area,
+ evc->size - 2 * sizeof(uint64_t));
+ }
+ else
+ ret = -EINVAL;

xfree(receive_buf);
}
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma
hvm_set_segment_register(v, x86_seg_ldtr, &seg);

/* In case xsave-absent save file is restored on a xsave-capable host */
- if ( xsave_enabled(v) )
+ if ( cpu_has_xsave && !xsave_enabled(v) )
{
struct xsave_struct *xsave_area = v->arch.xsave_area;

memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
- v->arch.xcr0_accum = XSTATE_FP_SSE;
- v->arch.xcr0 = XSTATE_FP_SSE;
}
else
memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
@@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma
HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
1, HVMSR_PER_VCPU);

-#define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size)
+#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \
+ save_area) + \
+ xstate_ctxt_size(xcr0))

static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
{
@@ -969,20 +969,20 @@ static int hvm_save_cpu_xsave_states(str

for_each_vcpu ( d, v )
{
+ unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum);
+
if ( !xsave_enabled(v) )
continue;
- if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, HVM_CPU_XSAVE_SIZE) )
+ if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) )
return 1;
ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
- h->cur += HVM_CPU_XSAVE_SIZE;
- memset(ctxt, 0, HVM_CPU_XSAVE_SIZE);
+ h->cur += size;

ctxt->xfeature_mask = xfeature_mask;
ctxt->xcr0 = v->arch.xcr0;
ctxt->xcr0_accum = v->arch.xcr0_accum;
- if ( v->fpu_initialised )
- memcpy(&ctxt->save_area,
- v->arch.xsave_area, xsave_cntxt_size);
+ memcpy(&ctxt->save_area, v->arch.xsave_area,
+ size - offsetof(struct hvm_hw_cpu_xsave, save_area));
}

return 0;
@@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str

static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
{
- int vcpuid;
+ unsigned int vcpuid, size;
+ int err;
struct vcpu *v;
struct hvm_hw_cpu_xsave *ctxt;
struct hvm_save_descriptor *desc;
- uint64_t _xfeature_mask;

/* Which vcpu is this? */
vcpuid = hvm_load_instance(h);
@@ -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str
}

/* Fails since we can't restore an img saved on xsave-capable host. */
- if ( !xsave_enabled(v) )
- return -EINVAL;
+ if ( !cpu_has_xsave )
+ return -EOPNOTSUPP;

/* Customized checking for entry since our entry is of variable length */
desc = (struct hvm_save_descriptor *)&h->data[h->cur];
if ( sizeof (*desc) > h->size - h->cur)
{
printk(XENLOG_G_WARNING
- "HVM%d restore: not enough data left to read descriptor"
- "for type %u\n", d->domain_id, CPU_XSAVE_CODE);
- return -1;
+ "HVM%d.%d restore: not enough data left to read xsave descriptor\n",
+ d->domain_id, vcpuid);
+ return -ENODATA;
}
if ( desc->length + sizeof (*desc) > h->size - h->cur)
{
printk(XENLOG_G_WARNING
- "HVM%d restore: not enough data left to read %u bytes "
- "for type %u\n", d->domain_id, desc->length, CPU_XSAVE_CODE);
- return -1;
+ "HVM%d.%d restore: not enough data left to read %u xsave bytes\n",
+ d->domain_id, vcpuid, desc->length);
+ return -ENODATA;
+ }
+ if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) +
+ XSTATE_AREA_MIN_SIZE )
+ {
+ printk(XENLOG_G_WARNING
+ "HVM%d.%d restore mismatch: xsave length %u < %zu\n",
+ d->domain_id, vcpuid, desc->length,
+ offsetof(struct hvm_hw_cpu_xsave,
+ save_area) + XSTATE_AREA_MIN_SIZE);
+ return -EINVAL;
}
- if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > HVM_CPU_XSAVE_SIZE) )
+ size = HVM_CPU_XSAVE_SIZE(xfeature_mask);
+ if ( desc->length > size )
{
printk(XENLOG_G_WARNING
- "HVM%d restore mismatch: expected type %u with max length %u, "
- "saw type %u length %u\n", d->domain_id, CPU_XSAVE_CODE,
- (unsigned int)HVM_CPU_XSAVE_SIZE,
- desc->typecode, desc->length);
- return -1;
+ "HVM%d.%d restore mismatch: xsave length %u > %u\n",
+ d->domain_id, vcpuid, desc->length, size);
+ return -EOPNOTSUPP;
}
h->cur += sizeof (*desc);
- /* Checking finished */

ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
h->cur += desc->length;

- _xfeature_mask = ctxt->xfeature_mask;
- if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask )
- return -EINVAL;
+ err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
+ ctxt->save_area.xsave_hdr.xstate_bv,
+ ctxt->xfeature_mask);
+ if ( err )
+ {
+ printk(XENLOG_G_WARNING
+ "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64
+ " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n",
+ d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum,
+ ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err);
+ return err;
+ }
+ size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum);
+ if ( desc->length > size )
+ {
+ printk(XENLOG_G_WARNING
+ "HVM%d.%d restore mismatch: xsave length %u > %u\n",
+ d->domain_id, vcpuid, desc->length, size);
+ return -EOPNOTSUPP;
+ }
+ /* Checking finished */

v->arch.xcr0 = ctxt->xcr0;
v->arch.xcr0_accum = ctxt->xcr0_accum;
- memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size);
+ memcpy(v->arch.xsave_area, &ctxt->save_area,
+ desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area));

return 0;
}
@@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA
"CPU_XSAVE",
hvm_save_cpu_xsave_states,
hvm_load_cpu_xsave_states,
- HVM_CPU_XSAVE_SIZE + sizeof (struct hvm_save_descriptor),
+ HVM_CPU_XSAVE_SIZE(xfeature_mask) +
+ sizeof(struct hvm_save_descriptor),
HVMSR_PER_VCPU);
return 0;
}
@@ -2767,7 +2795,7 @@ void hvm_cpuid(unsigned int input, unsig
__clear_bit(X86_FEATURE_APIC & 31, edx);

/* Fix up OSXSAVE. */
- if ( xsave_enabled(v) )
+ if ( cpu_has_xsave )
*ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ?
cpufeat_mask(X86_FEATURE_OSXSAVE) : 0;

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -947,8 +947,7 @@ static int construct_vmcs(struct vcpu *v
/* Host control registers. */
v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
__vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
- __vmwrite(HOST_CR4,
- mmu_cr4_features | (xsave_enabled(v) ? X86_CR4_OSXSAVE : 0));
+ __vmwrite(HOST_CR4, mmu_cr4_features);

/* Host CS:RIP. */
__vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
--- a/xen/arch/x86/i387.c
+++ b/xen/arch/x86/i387.c
@@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp
{
bool_t ok;

+ ASSERT(v->arch.xsave_area);
/*
* XCR0 normally represents what guest OS set. In case of Xen itself,
- * we set all supported feature mask before doing save/restore.
+ * we set the accumulated feature mask before doing save/restore.
*/
- ok = set_xcr0(v->arch.xcr0_accum);
+ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE);
ASSERT(ok);
xrstor(v, mask);
- ok = set_xcr0(v->arch.xcr0);
+ ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE);
ASSERT(ok);
}

@@ -124,13 +125,15 @@ static inline void fpu_xsave(struct vcpu
{
bool_t ok;

- /* XCR0 normally represents what guest OS set. In case of Xen itself,
- * we set all accumulated feature mask before doing save/restore.
+ ASSERT(v->arch.xsave_area);
+ /*
+ * XCR0 normally represents what guest OS set. In case of Xen itself,
+ * we set the accumulated feature mask before doing save/restore.
*/
- ok = set_xcr0(v->arch.xcr0_accum);
+ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE);
ASSERT(ok);
xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY);
- ok = set_xcr0(v->arch.xcr0);
+ ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE);
ASSERT(ok);
}

@@ -238,7 +241,7 @@ void vcpu_restore_fpu_lazy(struct vcpu *
if ( v->fpu_dirtied )
return;

- if ( xsave_enabled(v) )
+ if ( cpu_has_xsave )
fpu_xrstor(v, XSTATE_LAZY);
else if ( v->fpu_initialised )
{
@@ -268,7 +271,7 @@ void vcpu_save_fpu(struct vcpu *v)
/* This can happen, if a paravirtualised guest OS has set its CR0.TS. */
clts();

- if ( xsave_enabled(v) )
+ if ( cpu_has_xsave )
fpu_xsave(v);
else if ( cpu_has_fxsr )
fpu_fxsave(v);
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -816,7 +816,7 @@ static void pv_cpuid(struct cpu_user_reg
__clear_bit(X86_FEATURE_PDCM % 32, &c);
__clear_bit(X86_FEATURE_PCID % 32, &c);
__clear_bit(X86_FEATURE_DCA % 32, &c);
- if ( !xsave_enabled(current) )
+ if ( !cpu_has_xsave )
{
__clear_bit(X86_FEATURE_XSAVE % 32, &c);
__clear_bit(X86_FEATURE_AVX % 32, &c);
@@ -841,7 +841,7 @@ static void pv_cpuid(struct cpu_user_reg
break;

case 0x0000000d: /* XSAVE */
- if ( !xsave_enabled(current) )
+ if ( !cpu_has_xsave )
goto unsupported;
break;

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt;
* the supported and enabled features on the processor, including the
* XSAVE.HEADER. We only enable XCNTXT_MASK that we have known.
*/
-u32 xsave_cntxt_size;
+static u32 __read_mostly xsave_cntxt_size;

/* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. */
u64 xfeature_mask;
@@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas

bool_t xsave_enabled(const struct vcpu *v)
{
- if ( cpu_has_xsave )
- {
- ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE);
- ASSERT(v->arch.xsave_area);
- }
+ if ( !cpu_has_xsave )
+ return 0;

- return cpu_has_xsave;
+ ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE);
+ ASSERT(v->arch.xsave_area);
+
+ return !!v->arch.xcr0_accum;
}

int xstate_alloc_save_area(struct vcpu *v)
@@ -234,8 +234,8 @@ int xstate_alloc_save_area(struct vcpu *
save_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;

v->arch.xsave_area = save_area;
- v->arch.xcr0 = XSTATE_FP_SSE;
- v->arch.xcr0_accum = XSTATE_FP_SSE;
+ v->arch.xcr0 = 0;
+ v->arch.xcr0_accum = 0;

return 0;
}
@@ -253,7 +253,11 @@ void xstate_init(bool_t bsp)
u64 feature_mask;

if ( boot_cpu_data.cpuid_level < XSTATE_CPUID )
+ {
+ BUG_ON(!bsp);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
return;
+ }

cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);

@@ -273,7 +277,6 @@ void xstate_init(bool_t bsp)
set_in_cr4(X86_CR4_OSXSAVE);
if ( !set_xcr0(feature_mask) )
BUG();
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);

if ( bsp )
{
@@ -282,14 +285,14 @@ void xstate_init(bool_t bsp)
* xsave_cntxt_size is the max size required by enabled features.
* We know FP/SSE and YMM about eax, and nothing about edx at present.
*/
- xsave_cntxt_size = ebx;
+ xsave_cntxt_size = xstate_ctxt_size(feature_mask);
printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
__func__, xsave_cntxt_size, xfeature_mask);
}
else
{
BUG_ON(xfeature_mask != feature_mask);
- BUG_ON(xsave_cntxt_size != ebx);
+ BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask));
}

/* Check XSAVEOPT feature. */
@@ -300,6 +303,42 @@ void xstate_init(bool_t bsp)
BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT));
}

+unsigned int xstate_ctxt_size(u64 xcr0)
+{
+ u32 ebx = 0;
+
+ if ( xcr0 )
+ {
+ u64 act_xcr0 = get_xcr0();
+ u32 eax, ecx, edx;
+ bool_t ok = set_xcr0(xcr0);
+
+ ASSERT(ok);
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ ASSERT(ebx <= ecx);
+ ok = set_xcr0(act_xcr0);
+ ASSERT(ok);
+ }
+
+ return ebx;
+}
+
+int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask)
+{
+ if ( (xcr0_accum & ~xfeat_mask) ||
+ (xstate_bv & ~xcr0_accum) ||
+ (xcr0 & ~xcr0_accum) ||
+ !(xcr0 & XSTATE_FP) ||
+ ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) ||
+ ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) )
+ return -EINVAL;
+
+ if ( xcr0_accum & ~xfeature_mask )
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
int handle_xsetbv(u32 index, u64 new_bv)
{
struct vcpu *curr = current;
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s
#define pv_guest_cr4_to_real_cr4(v) \
(((v)->arch.pv_vcpu.ctrlreg[4] \
| (mmu_cr4_features \
- & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP)) \
- | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0) \
- | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0)) \
+ & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \
+ X86_CR4_OSXSAVE)) \
+ | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \
& ~X86_CR4_DE)
#define real_cr4_to_pv_guest_cr4(c) \
((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD \
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru
((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\
? X86_CR4_VMXE : 0) | \
(cpu_has_pcid ? X86_CR4_PCIDE : 0) | \
- (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0))))
+ (cpu_has_xsave ? X86_CR4_OSXSAVE : 0))))

/* These exceptions must always be intercepted. */
#define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op))
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -33,7 +33,6 @@
#define XSTATE_NONLAZY (XSTATE_LWP)
#define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY)

-extern unsigned int xsave_cntxt_size;
extern u64 xfeature_mask;

/* extended state save area */
@@ -76,11 +75,14 @@ uint64_t get_xcr0(void);
void xsave(struct vcpu *v, uint64_t mask);
void xrstor(struct vcpu *v, uint64_t mask);
bool_t xsave_enabled(const struct vcpu *v);
+int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv,
+ u64 xfeat_mask);
int __must_check handle_xsetbv(u32 index, u64 new_bv);

/* extended state init and cleanup functions */
void xstate_free_save_area(struct vcpu *v);
int xstate_alloc_save_area(struct vcpu *v);
void xstate_init(bool_t bsp);
+unsigned int xstate_ctxt_size(u64 xcr0);

#endif /* __ASM_XSTATE_H */
@ -1,25 +0,0 @@
# Commit f3a4eb9253826d1e49e682314c8666b28fa0b717
# Date 2013-09-10 16:41:35 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: Clear bit 31 of IA32_VMX_BASIC MSR

Bit 31 of revision_id will be set to 1 if VMCS shadowing is enabled.
And according to the Intel SDM, bit 31 of the IA32_VMX_BASIC MSR is
always 0. So we cannot set the low 32 bits of IA32_VMX_BASIC to
revision_id directly; we must clear bit 31 to 0.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1828,7 +1828,7 @@ int nvmx_msr_read_intercept(unsigned int
switch (msr) {
case MSR_IA32_VMX_BASIC:
data = (host_data & (~0ul << 32)) |
- ((v->arch.hvm_vmx.vmcs)->vmcs_revision_id);
+ (v->arch.hvm_vmx.vmcs->vmcs_revision_id & 0x7fffffff);
break;
case MSR_IA32_VMX_PINBASED_CTLS:
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
@ -1,41 +0,0 @@
# Commit 546ba2f17008387cf9821df46e6dac04f0883a9b
# Date 2013-09-10 17:16:02 +0200
# Author Matthew Daley <mattjd@gmail.com>
# Committer Jan Beulich <jbeulich@suse.com>
sched/arinc653: check for guest data transfer failures

Coverity-ID: 1055121
Coverity-ID: 1055122
Coverity-ID: 1055123
Coverity-ID: 1055124

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/sched_arinc653.c
+++ b/xen/common/sched_arinc653.c
@@ -635,12 +635,21 @@ a653sched_adjust_global(const struct sch
switch ( sc->cmd )
{
case XEN_SYSCTL_SCHEDOP_putinfo:
- copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1);
+ if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
rc = arinc653_sched_set(ops, &local_sched);
break;
case XEN_SYSCTL_SCHEDOP_getinfo:
rc = arinc653_sched_get(ops, &local_sched);
- copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1);
+ if ( rc )
+ break;
+
+ if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) )
+ rc = -EFAULT;
break;
}

@ -1,116 +0,0 @@
References: bnc#839600

# Commit 8efce9d69998a3d3c720ac7dbdb9b7e240369957
# Date 2013-09-12 09:52:53 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: fix memory cut-off when using PFN compression

For one, setup_max_pdx(), when invoked a second time (after SRAT got
parsed), needs to start from the original max_page value again (using
the already adjusted one from the first invocation would not allow the
cut-off boundary to be moved up).

Second, _if_ we need to cut off some part of memory, we must not allow
this to also propagate into the NUMA accounting. Otherwise
cutoff_node() results in nodes_cover_memory() finding some parts of
memory apparently not having a PXM association, causing all SRAT info
to be ignored.

The only possibly problematic consumer of node_spanned_pages (the
meaning of which gets altered here in that it now also includes memory
Xen can't actively make use of) is XEN_SYSCTL_numainfo: At a first
glance the potentially larger reported memory size shouldn't confuse
tool stacks.

And finally we must not put our boot time modules at addresses which
(at that time) can't be guaranteed to be accessible later. This applies
to both the EFI boot loader and the module relocation code.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
Acked-by: Dario Faggioli <dario.faggioli@citrix.com>

--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -459,7 +459,8 @@ static bool_t __init read_file(EFI_FILE_
what = what ?: L"Seek";
else
{
- file->addr = (EFI_PHYSICAL_ADDRESS)1 << (32 + PAGE_SHIFT);
+ file->addr = min(1UL << (32 + PAGE_SHIFT),
+ HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START);
ret = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData,
PFN_UP(size), &file->addr);
}
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -377,9 +377,9 @@ static uint64_t __init consider_modules(
return e;
}

-static void __init setup_max_pdx(void)
+static void __init setup_max_pdx(unsigned long top_page)
{
- max_pdx = pfn_to_pdx(max_page - 1) + 1;
+ max_pdx = pfn_to_pdx(top_page - 1) + 1;

if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;
@@ -547,7 +547,7 @@ void __init __start_xen(unsigned long mb
unsigned int initrdidx;
multiboot_info_t *mbi = __va(mbi_p);
module_t *mod = (module_t *)__va(mbi->mods_addr);
- unsigned long nr_pages, modules_headroom, *module_map;
+ unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
int i, j, e820_warn = 0, bytes = 0;
bool_t acpi_boot_table_init_done = 0;
struct ns16550_defaults ns16550 = {
@@ -751,7 +751,7 @@ void __init __start_xen(unsigned long mb
}

/* Sanitise the raw E820 map to produce a final clean version. */
- max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
+ max_page = raw_max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);

/* Create a temporary copy of the E820 map. */
memcpy(&boot_e820, &e820, sizeof(e820));
@@ -820,7 +820,10 @@ void __init __start_xen(unsigned long mb
(end - s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
}

- e = min_t(uint64_t, e, 1ULL << (PAGE_SHIFT + 32));
+ if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
+ 1UL << (PAGE_SHIFT + 32)) )
+ e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
+ 1UL << (PAGE_SHIFT + 32));
#define reloc_size ((__pa(&_end) + mask) & ~mask)
/* Is the region suitable for relocating Xen? */
if ( !xen_phys_start && e <= limit )
@@ -969,7 +972,7 @@ void __init __start_xen(unsigned long mb
/* Late kexec reservation (dynamic start address). */
kexec_reserve_area(&boot_e820);

- setup_max_pdx();
+ setup_max_pdx(raw_max_page);
if ( highmem_start )
xenheap_max_mfn(PFN_DOWN(highmem_start));

@@ -995,7 +998,7 @@ void __init __start_xen(unsigned long mb
{
acpi_boot_table_init_done = 1;
srat_parse_regions(s);
- setup_max_pdx();
+ setup_max_pdx(raw_max_page);
}

if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
@@ -1133,7 +1136,7 @@ void __init __start_xen(unsigned long mb

acpi_numa_init();

- numa_initmem_init(0, max_page);
+ numa_initmem_init(0, raw_max_page);

end_boot_allocator();
system_state = SYS_STATE_boot;
@ -1,94 +0,0 @@
# Commit 06d086832155fc7f5344e9d108b979de34674d11
# Date 2013-09-12 17:41:04 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
libxc/x86: fix page table creation for huge guests

The switch-over logic from one page directory to the next was wrong;
it needs to be deferred until we actually reach the last page within
a given region, instead of being done when the last entry of a page
directory gets started with.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>

--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -251,7 +251,7 @@ static int setup_pgtables_x86_32_pae(str
l3_pgentry_64_t *l3tab;
l2_pgentry_64_t *l2tab = NULL;
l1_pgentry_64_t *l1tab = NULL;
- unsigned long l3off, l2off, l1off;
+ unsigned long l3off, l2off = 0, l1off;
xen_vaddr_t addr;
xen_pfn_t pgpfn;
xen_pfn_t l3mfn = xc_dom_p2m_guest(dom, l3pfn);
@@ -299,8 +299,6 @@ static int setup_pgtables_x86_32_pae(str
l2off = l2_table_offset_pae(addr);
l2tab[l2off] =
pfn_to_paddr(xc_dom_p2m_guest(dom, l1pfn)) | L2_PROT;
- if ( l2off == (L2_PAGETABLE_ENTRIES_PAE - 1) )
- l2tab = NULL;
l1pfn++;
}

@@ -312,8 +310,13 @@ static int setup_pgtables_x86_32_pae(str
if ( (addr >= dom->pgtables_seg.vstart) &&
(addr < dom->pgtables_seg.vend) )
l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */
+
if ( l1off == (L1_PAGETABLE_ENTRIES_PAE - 1) )
+ {
l1tab = NULL;
+ if ( l2off == (L2_PAGETABLE_ENTRIES_PAE - 1) )
+ l2tab = NULL;
+ }
}

if ( dom->virt_pgtab_end <= 0xc0000000 )
@@ -360,7 +363,7 @@ static int setup_pgtables_x86_64(struct
l3_pgentry_64_t *l3tab = NULL;
l2_pgentry_64_t *l2tab = NULL;
l1_pgentry_64_t *l1tab = NULL;
- uint64_t l4off, l3off, l2off, l1off;
+ uint64_t l4off, l3off = 0, l2off = 0, l1off;
uint64_t addr;
xen_pfn_t pgpfn;

@@ -391,8 +394,6 @@ static int setup_pgtables_x86_64(struct
l3off = l3_table_offset_x86_64(addr);
l3tab[l3off] =
pfn_to_paddr(xc_dom_p2m_guest(dom, l2pfn)) | L3_PROT;
- if ( l3off == (L3_PAGETABLE_ENTRIES_X86_64 - 1) )
- l3tab = NULL;
l2pfn++;
}

@@ -405,8 +406,6 @@ static int setup_pgtables_x86_64(struct
l2off = l2_table_offset_x86_64(addr);
l2tab[l2off] =
pfn_to_paddr(xc_dom_p2m_guest(dom, l1pfn)) | L2_PROT;
- if ( l2off == (L2_PAGETABLE_ENTRIES_X86_64 - 1) )
- l2tab = NULL;
l1pfn++;
}

@@ -418,8 +417,17 @@ static int setup_pgtables_x86_64(struct
if ( (addr >= dom->pgtables_seg.vstart) &&
(addr < dom->pgtables_seg.vend) )
l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */
+
if ( l1off == (L1_PAGETABLE_ENTRIES_X86_64 - 1) )
+ {
l1tab = NULL;
+ if ( l2off == (L2_PAGETABLE_ENTRIES_X86_64 - 1) )
+ {
+ l2tab = NULL;
+ if ( l3off == (L3_PAGETABLE_ENTRIES_X86_64 - 1) )
+ l3tab = NULL;
+ }
+ }
}
return 0;

@ -1,30 +0,0 @@
# Commit 803f9a6cdfeda64beee908576de0ad02d6b0c480
# Date 2013-09-12 17:47:08 +0100
# Author Tim Deegan <tim@xen.org>
# Committer Tim Deegan <tim@xen.org>
cpufreq: missing check of copy_from_guest()

Coverity CID 1055131
Coverity CID 1055132

Signed-off-by: Tim Deegan <tim@xen.org>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/drivers/cpufreq/cpufreq.c
+++ b/xen/drivers/cpufreq/cpufreq.c
@@ -471,8 +471,12 @@ int set_px_pminfo(uint32_t acpi_id, stru
ret = -ENOMEM;
goto out;
}
- copy_from_guest(pxpt->states, dom0_px_info->states,
- dom0_px_info->state_count);
+ if ( copy_from_guest(pxpt->states, dom0_px_info->states,
+ dom0_px_info->state_count) )
+ {
+ ret = -EFAULT;
+ goto out;
+ }
pxpt->state_count = dom0_px_info->state_count;

if ( cpufreq_verbose )
@ -1,40 +0,0 @@
# Commit a54dc5f4fe1eae6b1beb21326ef0338cd3969cd1
# Date 2013-09-13 14:27:34 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: machine_restart() must not call acpi_dmar_reinstate() twice

.. as that function is not idempotent (it always alters the table
checksum). The (generally) duplicate call was a result of it being
made before machine_restart() re-invokes itself on the boot CPU.

Considering that no problem arose so far from the table corruption, I
doubt that we need to restore the correct table signature on the
reboot path in general. The only case where I can see this as
potentially necessary is the tboot one, hence do the call just in that
case.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/shutdown.c
+++ b/xen/arch/x86/shutdown.c
@@ -115,8 +115,6 @@ void machine_restart(unsigned int delay_
console_start_sync();
spin_debug_disable();

- acpi_dmar_reinstate();
-
local_irq_enable();

/* Ensure we are the boot CPU. */
@@ -141,7 +139,10 @@ void machine_restart(unsigned int delay_
mdelay(delay_millisecs);

if ( tboot_in_measured_env() )
+ {
+ acpi_dmar_reinstate();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
+ }

efi_reset_system(reboot_mode != 0);

@ -1,29 +0,0 @@
# Commit 925fbcb7fdd6238f26b1576dc1f3e297f1f24f1e
# Date 2013-09-18 14:45:24 +0200
# Author George Dunlap <george.dunlap@eu.citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: fix failure path in hvm_vcpu_initialise

It looks like one of the failure cases in hvm_vcpu_initialise jumps to
the wrong label; this could lead to slow leaks if something isn't
cleaned up properly.

I will probably change these labels in a future patch, but I figured
it was better to have this fix separately.

This is also a candidate for backport.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1125,7 +1125,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
/* Create bufioreq event channel. */
rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL);
if ( rc < 0 )
- goto fail2;
+ goto fail4;
d->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_EVTCHN] = rc;
}

@ -1,26 +0,0 @@
# Commit dad7e45bf44c0569546a3ed7d0fa4182a4a73f0a
# Date 2013-09-18 14:45:42 +0200
# Author George Dunlap <george.dunlap@eu.citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
VMX: fix failure path in construct_vmcs

If the allocation fails, make sure to call vmx_vmcs_exit().

This is a candidate for backport.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -894,7 +894,10 @@ static int construct_vmcs(struct vcpu *v
unsigned long *msr_bitmap = alloc_xenheap_page();

if ( msr_bitmap == NULL )
+ {
+ vmx_vmcs_exit(v);
return -ENOMEM;
+ }

memset(msr_bitmap, ~0, PAGE_SIZE);
v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
@ -1,184 +0,0 @@
# Commit 3b89f08a498ddac09d4002d9849e329018ceb107
# Date 2013-09-20 11:01:08 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: properly handle MMIO reads and writes wider than a machine word

Just like real hardware we ought to split such accesses transparently
to the caller. With little extra effort we can at once even handle page
crossing accesses correctly.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -438,6 +438,7 @@ static int __hvmemul_read(
{
struct vcpu *curr = current;
unsigned long addr, reps = 1;
+ unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER);
uint32_t pfec = PFEC_page_present;
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
paddr_t gpa;
@@ -447,16 +448,38 @@ static int __hvmemul_read(
seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
if ( rc != X86EMUL_OKAY )
return rc;
+ off = addr & (PAGE_SIZE - 1);
+ /*
+ * We only need to handle sizes actual instruction operands can have. All
+ * such sizes are either powers of 2 or the sum of two powers of 2. Thus
+ * picking as initial chunk size the largest power of 2 not greater than
+ * the total size will always result in only power-of-2 size requests
+ * issued to hvmemul_do_mmio() (hvmemul_do_io() rejects non-powers-of-2).
+ */
+ while ( chunk & (chunk - 1) )
+ chunk &= chunk - 1;
+ if ( off + bytes > PAGE_SIZE )
+ while ( off & (chunk - 1) )
+ chunk >>= 1;

if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva )
{
- unsigned int off = addr & (PAGE_SIZE - 1);
if ( access_type == hvm_access_insn_fetch )
return X86EMUL_UNHANDLEABLE;
gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off);
- if ( (off + bytes) <= PAGE_SIZE )
- return hvmemul_do_mmio(gpa, &reps, bytes, 0,
- IOREQ_READ, 0, p_data);
+ while ( (off + chunk) <= PAGE_SIZE )
+ {
+ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data);
+ if ( rc != X86EMUL_OKAY || bytes == chunk )
+ return rc;
+ addr += chunk;
+ off += chunk;
+ gpa += chunk;
+ p_data += chunk;
+ bytes -= chunk;
+ if ( bytes < chunk )
+ chunk = bytes;
+ }
}

if ( (seg != x86_seg_none) &&
@@ -473,14 +496,32 @@ static int __hvmemul_read(
return X86EMUL_EXCEPTION;
case HVMCOPY_unhandleable:
return X86EMUL_UNHANDLEABLE;
- case HVMCOPY_bad_gfn_to_mfn:
+ case HVMCOPY_bad_gfn_to_mfn:
if ( access_type == hvm_access_insn_fetch )
return X86EMUL_UNHANDLEABLE;
- rc = hvmemul_linear_to_phys(
- addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
- return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data);
+ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
+ hvmemul_ctxt);
+ while ( rc == X86EMUL_OKAY )
+ {
+ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data);
+ if ( rc != X86EMUL_OKAY || bytes == chunk )
+ break;
+ addr += chunk;
+ off += chunk;
+ p_data += chunk;
+ bytes -= chunk;
+ if ( bytes < chunk )
+ chunk = bytes;
+ if ( off < PAGE_SIZE )
+ gpa += chunk;
+ else
+ {
+ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
+ hvmemul_ctxt);
+ off = 0;
+ }
+ }
+ return rc;
case HVMCOPY_gfn_paged_out:
return X86EMUL_RETRY;
case HVMCOPY_gfn_shared:
@@ -537,6 +578,7 @@ static int hvmemul_write(
container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
struct vcpu *curr = current;
unsigned long addr, reps = 1;
+ unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER);
uint32_t pfec = PFEC_page_present | PFEC_write_access;
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
paddr_t gpa;
@@ -546,14 +588,30 @@ static int hvmemul_write(
seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
if ( rc != X86EMUL_OKAY )
return rc;
+ off = addr & (PAGE_SIZE - 1);
+ /* See the respective comment in __hvmemul_read(). */
+ while ( chunk & (chunk - 1) )
+ chunk &= chunk - 1;
+ if ( off + bytes > PAGE_SIZE )
+ while ( off & (chunk - 1) )
+ chunk >>= 1;

if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva )
{
- unsigned int off = addr & (PAGE_SIZE - 1);
gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off);
- if ( (off + bytes) <= PAGE_SIZE )
- return hvmemul_do_mmio(gpa, &reps, bytes, 0,
- IOREQ_WRITE, 0, p_data);
+ while ( (off + chunk) <= PAGE_SIZE )
+ {
+ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data);
+ if ( rc != X86EMUL_OKAY || bytes == chunk )
+ return rc;
+ addr += chunk;
+ off += chunk;
+ gpa += chunk;
+ p_data += chunk;
+ bytes -= chunk;
+ if ( bytes < chunk )
+ chunk = bytes;
+ }
}

if ( (seg != x86_seg_none) &&
@@ -569,12 +627,29 @@ static int hvmemul_write(
case HVMCOPY_unhandleable:
return X86EMUL_UNHANDLEABLE;
case HVMCOPY_bad_gfn_to_mfn:
- rc = hvmemul_linear_to_phys(
- addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
- return hvmemul_do_mmio(gpa, &reps, bytes, 0,
- IOREQ_WRITE, 0, p_data);
+ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
+ hvmemul_ctxt);
+ while ( rc == X86EMUL_OKAY )
+ {
+ rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data);
+ if ( rc != X86EMUL_OKAY || bytes == chunk )
+ break;
+ addr += chunk;
+ off += chunk;
+ p_data += chunk;
+ bytes -= chunk;
+ if ( bytes < chunk )
+ chunk = bytes;
+ if ( off < PAGE_SIZE )
+ gpa += chunk;
+ else
+ {
+ rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
+ hvmemul_ctxt);
+ off = 0;
+ }
+ }
+ return rc;
case HVMCOPY_gfn_paged_out:
return X86EMUL_RETRY;
case HVMCOPY_gfn_shared:
@ -1,155 +0,0 @@
# Commit 5e5a44b6c942d6ea47f15d6f1ed02b03e0d69445
# Date 2013-09-20 11:37:28 +0200
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
sched_credit: filter node-affinity mask against online cpus

in _csched_cpu_pick(), as not doing so may result in the domain's
node-affinity mask (as retrieved by csched_balance_cpumask() )
and online mask (as retrieved by cpupool_scheduler_cpumask() )
having an empty intersection.

Therefore, when attempting a node-affinity load balancing step
and running this:

...
/* Pick an online CPU from the proper affinity mask */
csched_balance_cpumask(vc, balance_step, &cpus);
cpumask_and(&cpus, &cpus, online);
...

we end up with an empty cpumask (in cpus). At this point, in
the following code:

....
/* If present, prefer vc's current processor */
cpu = cpumask_test_cpu(vc->processor, &cpus)
? vc->processor
: cpumask_cycle(vc->processor, &cpus);
....

an ASSERT (from inside cpumask_cycle() ) triggers like this:

(XEN) Xen call trace:
(XEN) [<ffff82d08011b124>] _csched_cpu_pick+0x1d2/0x652
(XEN) [<ffff82d08011b5b2>] csched_cpu_pick+0xe/0x10
(XEN) [<ffff82d0801232de>] vcpu_migrate+0x167/0x31e
(XEN) [<ffff82d0801238cc>] cpu_disable_scheduler+0x1c8/0x287
(XEN) [<ffff82d080101b3f>] cpupool_unassign_cpu_helper+0x20/0xb4
(XEN) [<ffff82d08010544f>] continue_hypercall_tasklet_handler+0x4a/0xb1
(XEN) [<ffff82d080127793>] do_tasklet_work+0x78/0xab
(XEN) [<ffff82d080127a70>] do_tasklet+0x5f/0x8b
(XEN) [<ffff82d080158985>] idle_loop+0x57/0x5e
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 1:
(XEN) Assertion 'cpu < nr_cpu_ids' failed at /home/dario/Sources/xen/xen/xen.git/xen/include/xe:16481

It is for example sufficient to have a domain with node-affinity
to NUMA node 1 running, and issuing a `xl cpupool-numa-split'
would make the above happen. That is because, by default, all
the existing domains remain assigned to the first cpupool, and
it now (after the cpupool-numa-split) only includes NUMA node 0.

This change prevents that by generalizing the function used
for figuring out whether a node-affinity load balancing step
is legit or not. This way we can, in _csched_cpu_pick(),
figure out early enough that the mask would end up empty,
skip the step altogether and avoid the splat.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>

--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -296,15 +296,28 @@ static void csched_set_node_affinity(
* vcpu-affinity balancing is always necessary and must never be skipped.
* OTOH, if a domain's node-affinity is said to be automatically computed
* (or if it just spans all the nodes), we can safely avoid dealing with
- * node-affinity entirely. Ah, node-affinity is also deemed meaningless
- * in case it has empty intersection with the vcpu's vcpu-affinity, as it
- * would mean trying to schedule it on _no_ pcpu!
+ * node-affinity entirely.
+ *
+ * Node-affinity is also deemed meaningless in case it has empty
+ * intersection with mask, to cover the cases where using the node-affinity
+ * mask seems legit, but would instead led to trying to schedule the vcpu
+ * on _no_ pcpu! Typical use cases are for mask to be equal to the vcpu's
+ * vcpu-affinity, or to the && of vcpu-affinity and the set of online cpus
+ * in the domain's cpupool.
*/
-#define __vcpu_has_node_affinity(vc) \
- ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \
- || !cpumask_intersects(vc->cpu_affinity, \
- CSCHED_DOM(vc->domain)->node_affinity_cpumask) \
- || vc->domain->auto_node_affinity == 1) )
+static inline int __vcpu_has_node_affinity(const struct vcpu *vc,
+ const cpumask_t *mask)
+{
+ const struct domain *d = vc->domain;
+ const struct csched_dom *sdom = CSCHED_DOM(d);
+
+ if ( d->auto_node_affinity
+ || cpumask_full(sdom->node_affinity_cpumask)
+ || !cpumask_intersects(sdom->node_affinity_cpumask, mask) )
+ return 0;
+
+ return 1;
+}

/*
* Each csched-balance step uses its own cpumask. This function determines
@@ -393,7 +406,8 @@ __runq_tickle(unsigned int cpu, struct c
int new_idlers_empty;

if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY
- && !__vcpu_has_node_affinity(new->vcpu) )
+ && !__vcpu_has_node_affinity(new->vcpu,
+ new->vcpu->cpu_affinity) )
continue;

/* Are there idlers suitable for new (for this balance step)? */
@@ -626,11 +640,32 @@ _csched_cpu_pick(const struct scheduler
int cpu = vc->processor;
int balance_step;

+ /* Store in cpus the mask of online cpus on which the domain can run */
online = cpupool_scheduler_cpumask(vc->domain->cpupool);
+ cpumask_and(&cpus, vc->cpu_affinity, online);
+
for_each_csched_balance_step( balance_step )
{
+ /*
+ * We want to pick up a pcpu among the ones that are online and
+ * can accommodate vc, which is basically what we computed above
+ * and stored in cpus. As far as vcpu-affinity is concerned,
+ * there always will be at least one of these pcpus, hence cpus
+ * is never empty and the calls to cpumask_cycle() and
+ * cpumask_test_cpu() below are ok.
+ *
+ * On the other hand, when considering node-affinity too, it
+ * is possible for the mask to become empty (for instance, if the
+ * domain has been put in a cpupool that does not contain any of the
+ * nodes in its node-affinity), which would result in the ASSERT()-s
+ * inside cpumask_*() operations triggering (in debug builds).
+ *
+ * Therefore, in this case, we filter the node-affinity mask against
+ * cpus and, if the result is empty, we just skip the node-affinity
+ * balancing step all together.
|
||||
+ */
|
||||
if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY
|
||||
- && !__vcpu_has_node_affinity(vc) )
|
||||
+ && !__vcpu_has_node_affinity(vc, &cpus) )
|
||||
continue;
|
||||
|
||||
/* Pick an online CPU from the proper affinity mask */
|
||||
@@ -1449,7 +1484,7 @@ csched_runq_steal(int peer_cpu, int cpu,
|
||||
* or counter.
|
||||
*/
|
||||
if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY
|
||||
- && !__vcpu_has_node_affinity(vc) )
|
||||
+ && !__vcpu_has_node_affinity(vc, vc->cpu_affinity) )
|
||||
continue;
|
||||
|
||||
csched_balance_cpumask(vc, balance_step, csched_balance_mask);
|
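
As an illustrative aside, not part of the patch series: the failure mode above boils down to a bitmask intersection becoming empty. The standalone C sketch below models cpumasks as plain 64-bit words (an assumption for brevity; Xen's cpumask_t is a bitmap of NR_CPUS bits) on a hypothetical 8-pcpu, 2-node host, and shows why the node-affinity step must be skipped when the filtered mask is empty.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Model a 2-node, 8-pcpu host: node 0 = cpus 0-3, node 1 = cpus 4-7. */
    uint64_t node_affinity = 0xf0;  /* domain's node-affinity: node 1 */
    uint64_t online        = 0x0f;  /* cpupool after numa-split: node 0 only */

    uint64_t cpus = node_affinity & online;  /* the cpumask_and() above */
    if ( cpus == 0 )
        puts("empty intersection: skip the node-affinity balance step");
    else
        printf("pick a cpu from mask %#lx\n", (unsigned long)cpus);
    return 0;
}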
@ -1,48 +0,0 @@
# Commit df17e9c889c48c9c10aa3f9dd0bb11077f54efc4
# Date 2013-09-20 11:41:08 +0200
# Author Olaf Hering <olaf@aepfle.de>
# Committer Jan Beulich <jbeulich@suse.com>
unmodified_drivers: enable unplug per default

Since xen-3.3 an official unplug protocol for emulated hardware is
available in the toolstack. The pvops kernel does the unplug per
default, so it is safe to do it also in the drivers for forward ported
xenlinux.
Currently it's required to load xen-platform-pci with the module
parameter dev_unplug=all, which is cumbersome.
Also recognize the dev_unplug=never parameter, which provides the
default before this patch.

Signed-off-by: Olaf Hering <olaf@aepfle.de>

--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
@@ -66,7 +66,7 @@ MODULE_LICENSE("GPL");
static char *dev_unplug;
module_param(dev_unplug, charp, 0644);
MODULE_PARM_DESC(dev_unplug, "Emulated devices to unplug: "
- "[all,][ide-disks,][aux-ide-disks,][nics]\n");
+ "[all,][ide-disks,][aux-ide-disks,][nics,][never] (default is 'all')\n");

struct pci_dev *xen_platform_pdev;

@@ -290,6 +290,10 @@ static int check_platform_magic(struct d
short magic, unplug = 0;
char protocol, *p, *q, *err;

+ /* Unconditionally unplug everything */
+ if (!dev_unplug)
+ unplug = UNPLUG_ALL;
+
for (p = dev_unplug; p; p = q) {
q = strchr(dev_unplug, ',');
if (q)
@@ -302,6 +306,8 @@ static int check_platform_magic(struct d
unplug |= UNPLUG_AUX_IDE_DISKS;
else if (!strcmp(p, "nics"))
unplug |= UNPLUG_ALL_NICS;
+ else if (!strcmp(p, "never"))
+ unplug = 0;
else
dev_warn(dev, "unrecognised option '%s' "
"in module parameter 'dev_unplug'\n", p);
@ -1,92 +0,0 @@
# Commit 7f12732670b31b2fea899a4160d455574658474f
# Date 2013-09-23 09:53:55 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: linear address must be canonical for the whole accessed range

... rather than just for the first byte.

While at it, also
- make the real mode case at least do a wrap around check
- drop the mis-named "gpf" label (we're not generating faults here)
and use in-place returns instead

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1938,8 +1938,7 @@ int hvm_virtual_to_linear_addr(
unsigned int addr_size,
unsigned long *linear_addr)
{
- unsigned long addr = offset;
- uint32_t last_byte;
+ unsigned long addr = offset, last_byte;

if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
{
@@ -1948,6 +1947,9 @@ int hvm_virtual_to_linear_addr(
* Certain of them are not done in native real mode anyway.
*/
addr = (uint32_t)(addr + reg->base);
+ last_byte = (uint32_t)addr + bytes - 1;
+ if ( last_byte < addr )
+ return 0;
}
else if ( addr_size != 64 )
{
@@ -1959,17 +1961,17 @@ int hvm_virtual_to_linear_addr(
{
case hvm_access_read:
if ( (reg->attr.fields.type & 0xa) == 0x8 )
- goto gpf; /* execute-only code segment */
+ return 0; /* execute-only code segment */
break;
case hvm_access_write:
if ( (reg->attr.fields.type & 0xa) != 0x2 )
- goto gpf; /* not a writable data segment */
+ return 0; /* not a writable data segment */
break;
default:
break;
}

- last_byte = offset + bytes - 1;
+ last_byte = (uint32_t)offset + bytes - 1;

/* Is this a grows-down data segment? Special limit check if so. */
if ( (reg->attr.fields.type & 0xc) == 0x4 )
@@ -1980,10 +1982,10 @@ int hvm_virtual_to_linear_addr(

/* Check first byte and last byte against respective bounds. */
if ( (offset <= reg->limit) || (last_byte < offset) )
- goto gpf;
+ return 0;
}
else if ( (last_byte > reg->limit) || (last_byte < offset) )
- goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
+ return 0; /* last byte is beyond limit or wraps 0xFFFFFFFF */

/*
* Hardware truncates to 32 bits in compatibility mode.
@@ -2000,15 +2002,14 @@ int hvm_virtual_to_linear_addr(
if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
addr += reg->base;

- if ( !is_canonical_address(addr) )
- goto gpf;
+ last_byte = addr + bytes - 1;
+ if ( !is_canonical_address(addr) || last_byte < addr ||
+ !is_canonical_address(last_byte) )
+ return 0;
}

*linear_addr = addr;
return 1;
-
- gpf:
- return 0;
}

/* On non-NULL return, we leave this function holding an additional
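
An illustrative aside, not part of the patch series: the standalone C sketch below (with a simplified is_canonical() assuming 48-bit implementations, where bits 63:47 must all be equal) demonstrates why checking only the first byte is insufficient — an access can start just below the canonical boundary and end inside the non-canonical hole.

#include <stdint.h>
#include <stdio.h>

/* 48-bit implementations: the address is canonical iff sign-extending
 * bit 47 reproduces the original value. */
static int is_canonical(uint64_t addr)
{
    return (int64_t)addr == (((int64_t)addr << 16) >> 16);
}

int main(void)
{
    uint64_t addr = 0x00007ffffffffffcUL; /* canonical first byte */
    unsigned int bytes = 8;               /* access crosses the boundary */
    uint64_t last = addr + bytes - 1;     /* 0x0000800000000003: not canonical */

    printf("first byte canonical: %d\n", is_canonical(addr)); /* 1 */
    printf("last byte canonical:  %d\n", is_canonical(last)); /* 0: must fail */
    return 0;
}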
@ -1,65 +0,0 @@
# Commit 14fcce2fa883405bab26b60821a6cc5f2c770833
# Date 2013-09-23 09:55:14 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: refuse doing string operations in certain situations

We shouldn't do any acceleration for
- "rep movs" when either side is passed through MMIO or when both sides
are handled by qemu
- "rep ins" and "rep outs" when the memory operand is any kind of MMIO

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -686,6 +686,7 @@ static int hvmemul_rep_ins(
unsigned long addr;
uint32_t pfec = PFEC_page_present | PFEC_write_access;
paddr_t gpa;
+ p2m_type_t p2mt;
int rc;

rc = hvmemul_virtual_to_linear(
@@ -702,6 +703,10 @@ static int hvmemul_rep_ins(
if ( rc != X86EMUL_OKAY )
return rc;

+ (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt);
+ if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
+ return X86EMUL_UNHANDLEABLE;
+
return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
!!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
}
@@ -719,6 +724,7 @@ static int hvmemul_rep_outs(
unsigned long addr;
uint32_t pfec = PFEC_page_present;
paddr_t gpa;
+ p2m_type_t p2mt;
int rc;

rc = hvmemul_virtual_to_linear(
@@ -735,6 +741,10 @@ static int hvmemul_rep_outs(
if ( rc != X86EMUL_OKAY )
return rc;

+ (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt);
+ if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
+ return X86EMUL_UNHANDLEABLE;
+
return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
!!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
}
@@ -787,6 +797,10 @@ static int hvmemul_rep_movs(
(void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT, &sp2mt);
(void) get_gfn_query_unlocked(current->domain, dgpa >> PAGE_SHIFT, &dp2mt);

+ if ( sp2mt == p2m_mmio_direct || dp2mt == p2m_mmio_direct ||
+ (sp2mt == p2m_mmio_dm && dp2mt == p2m_mmio_dm) )
+ return X86EMUL_UNHANDLEABLE;
+
if ( sp2mt == p2m_mmio_dm )
return hvmemul_do_mmio(
sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL);
@ -1,52 +0,0 @@
References: bnc#839596 CVE-2013-1442 XSA-62

# Commit 63a75ba0de817d6f384f96d25427a05c313e2179
# Date 2013-09-25 10:41:25 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/xsave: initialize extended register state when guests enable it

Till now, when setting previously unset bits in XCR0 we wouldn't touch
the active register state, thus leaving in the newly enabled registers
whatever a prior user of it left there, i.e. potentially leaking
information between guests.

This is CVE-2013-1442 / XSA-62.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -342,6 +342,7 @@ int validate_xstate(u64 xcr0, u64 xcr0_a
int handle_xsetbv(u32 index, u64 new_bv)
{
struct vcpu *curr = current;
+ u64 mask;

if ( index != XCR_XFEATURE_ENABLED_MASK )
return -EOPNOTSUPP;
@@ -355,9 +356,23 @@ int handle_xsetbv(u32 index, u64 new_bv)
if ( !set_xcr0(new_bv) )
return -EFAULT;

+ mask = new_bv & ~curr->arch.xcr0_accum;
curr->arch.xcr0 = new_bv;
curr->arch.xcr0_accum |= new_bv;

+ mask &= curr->fpu_dirtied ? ~XSTATE_FP_SSE : XSTATE_NONLAZY;
+ if ( mask )
+ {
+ unsigned long cr0 = read_cr0();
+
+ clts();
+ if ( curr->fpu_dirtied )
+ asm ( "stmxcsr %0" : "=m" (curr->arch.xsave_area->fpu_sse.mxcsr) );
+ xrstor(curr, mask);
+ if ( cr0 & X86_CR0_TS )
+ write_cr0(cr0);
+ }
+
return 0;
}

@ -1,177 +0,0 @@
References: bnc#840592 CVE-2013-4355 XSA-63

# Commit 6bb838e7375f5b031e9ac346b353775c90de45dc
# Date 2013-09-30 14:17:46 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: properly handle hvm_copy_from_guest_{phys,virt}() errors

Ignoring them generally implies using uninitialized data and, in all
but two of the cases dealt with here, potentially leaking hypervisor
stack contents to guests.

This is CVE-2013-4355 / XSA-63.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2316,11 +2316,7 @@ void hvm_task_switch(

rc = hvm_copy_from_guest_virt(
&tss, prev_tr.base, sizeof(tss), PFEC_page_present);
- if ( rc == HVMCOPY_bad_gva_to_gfn )
- goto out;
- if ( rc == HVMCOPY_gfn_paged_out )
- goto out;
- if ( rc == HVMCOPY_gfn_shared )
+ if ( rc != HVMCOPY_okay )
goto out;

eflags = regs->eflags;
@@ -2365,13 +2361,11 @@ void hvm_task_switch(

rc = hvm_copy_from_guest_virt(
&tss, tr.base, sizeof(tss), PFEC_page_present);
- if ( rc == HVMCOPY_bad_gva_to_gfn )
- goto out;
- if ( rc == HVMCOPY_gfn_paged_out )
- goto out;
- /* Note: this could be optimised, if the callee functions knew we want RO
- * access */
- if ( rc == HVMCOPY_gfn_shared )
+ /*
+ * Note: The HVMCOPY_gfn_shared case could be optimised, if the callee
+ * functions knew we want RO access.
+ */
+ if ( rc != HVMCOPY_okay )
goto out;


--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -87,17 +87,28 @@ static int hvm_mmio_access(struct vcpu *
{
for ( i = 0; i < p->count; i++ )
{
- int ret;
-
- ret = hvm_copy_from_guest_phys(&data,
- p->data + (sign * i * p->size),
- p->size);
- if ( (ret == HVMCOPY_gfn_paged_out) ||
- (ret == HVMCOPY_gfn_shared) )
+ switch ( hvm_copy_from_guest_phys(&data,
+ p->data + sign * i * p->size,
+ p->size) )
{
+ case HVMCOPY_okay:
+ break;
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
rc = X86EMUL_RETRY;
break;
+ case HVMCOPY_bad_gfn_to_mfn:
+ data = ~0;
+ break;
+ case HVMCOPY_bad_gva_to_gfn:
+ ASSERT(0);
+ /* fall through */
+ default:
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
}
+ if ( rc != X86EMUL_OKAY )
+ break;
rc = write_handler(v, p->addr + (sign * i * p->size), p->size,
data);
if ( rc != X86EMUL_OKAY )
@@ -165,8 +176,28 @@ static int process_portio_intercept(port
for ( i = 0; i < p->count; i++ )
{
data = 0;
- (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size,
- p->size);
+ switch ( hvm_copy_from_guest_phys(&data,
+ p->data + sign * i * p->size,
+ p->size) )
+ {
+ case HVMCOPY_okay:
+ break;
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
+ rc = X86EMUL_RETRY;
+ break;
+ case HVMCOPY_bad_gfn_to_mfn:
+ data = ~0;
+ break;
+ case HVMCOPY_bad_gva_to_gfn:
+ ASSERT(0);
+ /* fall through */
+ default:
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
+ }
+ if ( rc != X86EMUL_OKAY )
+ break;
rc = action(IOREQ_WRITE, p->addr, p->size, &data);
if ( rc != X86EMUL_OKAY )
break;
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -340,14 +340,24 @@ static int dpci_ioport_write(uint32_t mp
data = p->data;
if ( p->data_is_ptr )
{
- int ret;
-
- ret = hvm_copy_from_guest_phys(&data,
- p->data + (sign * i * p->size),
- p->size);
- if ( (ret == HVMCOPY_gfn_paged_out) &&
- (ret == HVMCOPY_gfn_shared) )
+ switch ( hvm_copy_from_guest_phys(&data,
+ p->data + sign * i * p->size,
+ p->size) )
+ {
+ case HVMCOPY_okay:
+ break;
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
return X86EMUL_RETRY;
+ case HVMCOPY_bad_gfn_to_mfn:
+ data = ~0;
+ break;
+ case HVMCOPY_bad_gva_to_gfn:
+ ASSERT(0);
+ /* fall through */
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
}

switch ( p->size )
--- a/xen/arch/x86/hvm/vmx/realmode.c
+++ b/xen/arch/x86/hvm/vmx/realmode.c
@@ -39,7 +39,9 @@ static void realmode_deliver_exception(

again:
last_byte = (vector * 4) + 3;
- if ( idtr->limit < last_byte )
+ if ( idtr->limit < last_byte ||
+ hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4) !=
+ HVMCOPY_okay )
{
/* Software interrupt? */
if ( insn_len != 0 )
@@ -64,8 +66,6 @@ static void realmode_deliver_exception(
}
}

- (void)hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4);
-
frame[0] = regs->eip + insn_len;
frame[1] = csr->sel;
frame[2] = regs->eflags & ~X86_EFLAGS_RF;
@ -1,56 +0,0 @@
References: bnc#840593 CVE-2013-4356 XSA-64

# Commit f46befdd825c8a459c5eb21adb7d5b0dc6e30ad5
# Date 2013-09-30 14:18:25 +0200
# Author Tim Deegan <tim@xen.org>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mm/shadow: Fix initialization of PV shadow L4 tables.

Shadowed PV L4 tables must have the same Xen mappings as their
unshadowed equivalent. This is done by copying the Xen entries
verbatim from the idle pagetable, and then using guest_l4_slot()
in the SHADOW_FOREACH_L4E() iterator to avoid touching those entries.

adc5afbf1c70ef55c260fb93e4b8ce5ccb918706 (x86: support up to 16Tb)
changed the definition of ROOT_PAGETABLE_XEN_SLOTS to extend right to
the top of the address space, which causes the shadow code to
copy Xen mappings into guest-kernel-address slots too.

In the common case, all those slots are zero in the idle pagetable,
and no harm is done. But if any slot above #271 is non-zero, Xen will
crash when that slot is later cleared (it attempts to drop
shadow-pagetable refcounts on its own L4 pagetables).

Fix by using the new ROOT_PAGETABLE_PV_XEN_SLOTS when appropriate.
Monitor pagetables need the full Xen mappings, so they keep using the
old name (with its new semantics).

This is CVE-2013-4356 / XSA-64.

Signed-off-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1433,15 +1433,19 @@ void sh_install_xen_entries_in_l4(struct
{
struct domain *d = v->domain;
shadow_l4e_t *sl4e;
+ unsigned int slots;

sl4e = sh_map_domain_page(sl4mfn);
ASSERT(sl4e != NULL);
ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
-
+
/* Copy the common Xen mappings from the idle domain */
+ slots = (shadow_mode_external(d)
+ ? ROOT_PAGETABLE_XEN_SLOTS
+ : ROOT_PAGETABLE_PV_XEN_SLOTS);
memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
&idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
- ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+ slots * sizeof(l4_pgentry_t));

/* Install the per-domain mappings for this domain */
sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
@ -1,29 +0,0 @@
References: bnc#841766 CVE-2013-4361 XSA-66

# Commit 28b706efb6abb637fabfd74cde70a50935a5640b
# Date 2013-09-30 14:18:58 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: properly set up fbld emulation operand address

This is CVE-2013-4361 / XSA-66.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3156,11 +3156,11 @@ x86_emulate(
break;
case 4: /* fbld m80dec */
ea.bytes = 10;
- dst = ea;
+ src = ea;
if ( (rc = ops->read(src.mem.seg, src.mem.off,
&src.val, src.bytes, ctxt)) != 0 )
goto done;
- emulate_fpu_insn_memdst("fbld", src.val);
+ emulate_fpu_insn_memsrc("fbld", src.val);
break;
case 5: /* fild m64i */
ea.bytes = 8;
@ -1,116 +0,0 @@
# Commit ca145fe70bad3a25ad54c6ded1ef237e45a2311e
# Date 2013-09-30 15:28:12 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: don't blindly create L3 tables for the direct map

Now that the direct map area can extend all the way up to almost the
end of address space, this is wasteful.

Also fold two almost redundant messages in SRAT parsing into one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -137,7 +137,7 @@ l1_pgentry_t __attribute__ ((__section__
#define PTE_UPDATE_WITH_CMPXCHG
#endif

-bool_t __read_mostly mem_hotplug = 0;
+paddr_t __read_mostly mem_hotplug;

/* Private domain structs for DOMID_XEN and DOMID_IO. */
struct domain *dom_xen, *dom_io, *dom_cow;
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -113,6 +113,7 @@ static __init void bad_srat(void)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
pxm2node[i] = NUMA_NO_NODE;
+ mem_hotplug = 0;
}

/*
@@ -257,13 +258,6 @@ acpi_numa_memory_affinity_init(struct ac
return;
}
/* It is fine to add this area to the nodes data it will be used later*/
- if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
- {
- printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n",
- start, end);
- mem_hotplug = 1;
- }
-
i = conflicting_memblks(start, end);
if (i == node) {
printk(KERN_WARNING
@@ -287,8 +281,11 @@ acpi_numa_memory_affinity_init(struct ac
if (nd->end < end)
nd->end = end;
}
- printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
- start, end);
+ if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug)
+ mem_hotplug = end;
+ printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
+ node, pxm, start, end,
+ ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");

node_memblk_range[num_node_memblks].start = start;
node_memblk_range[num_node_memblks].end = end;
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -621,25 +621,20 @@ void __init paging_init(void)
* We setup the L3s for 1:1 mapping if host support memory hotplug
* to avoid sync the 1:1 mapping on page fault handler
*/
- if ( mem_hotplug )
+ for ( va = DIRECTMAP_VIRT_START;
+ va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug);
+ va += (1UL << L4_PAGETABLE_SHIFT) )
{
- unsigned long va;
-
- for ( va = DIRECTMAP_VIRT_START;
- va < DIRECTMAP_VIRT_END;
- va += (1UL << L4_PAGETABLE_SHIFT) )
+ if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
+ _PAGE_PRESENT) )
{
- if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
- _PAGE_PRESENT) )
- {
- l3_pg = alloc_domheap_page(NULL, 0);
- if ( !l3_pg )
- goto nomem;
- l3_ro_mpt = page_to_virt(l3_pg);
- clear_page(l3_ro_mpt);
- l4e_write(&idle_pg_table[l4_table_offset(va)],
- l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
- }
+ l3_pg = alloc_domheap_page(NULL, 0);
+ if ( !l3_pg )
+ goto nomem;
+ l3_ro_mpt = page_to_virt(l3_pg);
+ clear_page(l3_ro_mpt);
+ l4e_write(&idle_pg_table[l4_table_offset(va)],
+ l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
}
}

--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -399,7 +399,7 @@ static inline int get_page_and_type(stru
int check_descriptor(const struct domain *, struct desc_struct *d);

extern bool_t opt_allow_superpage;
-extern bool_t mem_hotplug;
+extern paddr_t mem_hotplug;

/******************************************************************************
* With shadow pagetables, the different kinds of address start
@ -1,82 +0,0 @@
# Commit 0aa27ce3351f7eb09d13e863a1d5f303086aa32a
# Date 2013-10-04 12:23:23 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/idle: Fix get_cpu_idle_time()'s interaction with offline pcpus

Checking for "idle_vcpu[cpu] != NULL" is insufficient protection against
offline pcpus. From a hypercall, vcpu_runstate_get() will determine "v !=
current", and try to take the vcpu_schedule_lock(). This will try to look up
per_cpu(schedule_data, v->processor) and promptly suffer a NULL structure
dereference as v->processor's __per_cpu_offset is INVALID_PERCPU_AREA.

One example might look like this:

...
Xen call trace:
[<ffff82c4c0126ddb>] vcpu_runstate_get+0x50/0x113
[<ffff82c4c0126ec6>] get_cpu_idle_time+0x28/0x2e
[<ffff82c4c012b5cb>] do_sysctl+0x3db/0xeb8
[<ffff82c4c023280d>] compat_hypercall+0xbd/0x116

Pagetable walk from 0000000000000040:
L4[0x000] = 0000000186df8027 0000000000028207
L3[0x000] = 0000000188e36027 00000000000261c9
L2[0x000] = 0000000000000000 ffffffffffffffff

****************************************
Panic on CPU 11:
...

get_cpu_idle_time() has been updated to correctly deal with offline pcpus
itself by returning 0, in the same way as it would if it was missing the
idle_vcpu[] pointer.

In doing so, XENPF_getidletime needed updating to correctly retain its
described behaviour of clearing bits in the cpumap for offline pcpus.

As this crash can only be triggered with toolstack hypercalls, it is not a
security issue and just a simple bug.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -355,10 +355,14 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA

for_each_cpu ( cpu, cpumap )
{
- if ( idle_vcpu[cpu] == NULL )
- cpumask_clear_cpu(cpu, cpumap);
idletime = get_cpu_idle_time(cpu);

+ if ( !idletime )
+ {
+ cpumask_clear_cpu(cpu, cpumap);
+ continue;
+ }
+
if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) )
{
ret = -EFAULT;
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -176,13 +176,12 @@ void vcpu_runstate_get(struct vcpu *v, s

uint64_t get_cpu_idle_time(unsigned int cpu)
{
- struct vcpu_runstate_info state;
- struct vcpu *v;
+ struct vcpu_runstate_info state = { 0 };
+ struct vcpu *v = idle_vcpu[cpu];

- if ( (v = idle_vcpu[cpu]) == NULL )
- return 0;
+ if ( cpu_online(cpu) && v )
+ vcpu_runstate_get(v, &state);

- vcpu_runstate_get(v, &state);
return state.time[RUNSTATE_running];
}

@ -1,35 +0,0 @@
# Commit 7cfb0053629c4dd1a6f01dc43cca7c0c25b8b7bf
# Date 2013-10-04 12:24:34 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/percpu: Force INVALID_PERCPU_AREA into the non-canonical address region

This causes accidental uses of per_cpu() on a pcpu with an INVALID_PERCPU_AREA
to result in a #GP for attempting to access the middle of the non-canonical
virtual address region.

This is preferable to the current behaviour, where incorrect use of per_cpu()
will result in an effective NULL structure dereference which has security
implication in the context of PV guests.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/percpu.c
+++ b/xen/arch/x86/percpu.c
@@ -6,7 +6,14 @@
#include <xen/rcupdate.h>

unsigned long __per_cpu_offset[NR_CPUS];
-#define INVALID_PERCPU_AREA (-(long)__per_cpu_start)
+
+/*
+ * Force uses of per_cpu() with an invalid area to attempt to access the
+ * middle of the non-canonical address space resulting in a #GP, rather than a
+ * possible #PF at (NULL + a little) which has security implications in the
+ * context of PV guests.
+ */
+#define INVALID_PERCPU_AREA (0x8000000000000000L - (long)__per_cpu_start)
#define PERCPU_ORDER (get_order_from_bytes(__per_cpu_data_end-__per_cpu_start))

void __init percpu_init_areas(void)
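
An illustrative aside, not part of the patch series: using hypothetical link-time addresses (the values below are made up purely for the example), this standalone C sketch shows where a stray per_cpu() access lands under the old and the new INVALID_PERCPU_AREA definitions.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values, just for illustration. */
#define PER_CPU_START   0xffff82d080502000UL  /* stands in for __per_cpu_start */
#define PERCPU_VAR_OFF  0x40UL                /* offset of some per-cpu variable */

int main(void)
{
    /* Old scheme: offset = -__per_cpu_start; a stray access lands near NULL. */
    uint64_t old_invalid = 0UL - PER_CPU_START;
    /* New scheme: bias by 2^63; a stray access lands mid non-canonical hole. */
    uint64_t new_invalid = 0x8000000000000000UL - PER_CPU_START;

    printf("old: %#lx (low, potentially mappable -> #PF)\n",
           (unsigned long)(PER_CPU_START + PERCPU_VAR_OFF + old_invalid));
    printf("new: %#lx (non-canonical -> guaranteed #GP)\n",
           (unsigned long)(PER_CPU_START + PERCPU_VAR_OFF + new_invalid));
    return 0;
}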
@ -1,82 +0,0 @@
# Commit 190b667ac20e8175758f4a3a0f13c4d990e6af7e
# Date 2013-10-04 12:28:14 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: check VMX capability before read VMX related MSRs

VMX MSRs are only available when the CPU supports the VMX feature. In
addition, VMX_TRUE* MSRs are only available when bit 55 of VMX_BASIC MSR
is set.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>

Cleanup.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Jun Nakajima <jun.nakajima@intel.com>

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -78,6 +78,7 @@ static DEFINE_PER_CPU(struct list_head,
static DEFINE_PER_CPU(bool_t, vmxon);

static u32 vmcs_revision_id __read_mostly;
+u64 __read_mostly vmx_basic_msr;

static void __init vmx_display_features(void)
{
@@ -301,6 +302,8 @@ static int vmx_init_vmcs_config(void)
vmx_vmexit_control = _vmx_vmexit_control;
vmx_vmentry_control = _vmx_vmentry_control;
cpu_has_vmx_ins_outs_instr_info = !!(vmx_basic_msr_high & (1U<<22));
+ vmx_basic_msr = ((u64)vmx_basic_msr_high << 32) |
+ vmx_basic_msr_low;
vmx_display_features();
}
else
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1814,12 +1814,33 @@ int nvmx_handle_invvpid(struct cpu_user_
int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
{
struct vcpu *v = current;
+ unsigned int ecx, dummy;
u64 data = 0, host_data = 0;
int r = 1;

if ( !nestedhvm_enabled(v->domain) )
return 0;

+ /* VMX capablity MSRs are available only when guest supports VMX. */
+ hvm_cpuid(0x1, &dummy, &dummy, &ecx, &dummy);
+ if ( !(ecx & cpufeat_mask(X86_FEATURE_VMXE)) )
+ return 0;
+
+ /*
+ * Those MSRs are available only when bit 55 of
+ * MSR_IA32_VMX_BASIC is set.
+ */
+ switch ( msr )
+ {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) )
+ return 0;
+ break;
+ }
+
rdmsrl(msr, host_data);

/*
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -284,6 +284,8 @@ extern bool_t cpu_has_vmx_ins_outs_instr
*/
#define VMX_BASIC_DEFAULT1_ZERO (1ULL << 55)

+extern u64 vmx_basic_msr;
+
/* Guest interrupt status */
#define VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK 0x0FF
#define VMX_GUEST_INTR_STATUS_SVI_OFFSET 8
@ -1,115 +0,0 @@
# Commit c6f92aed0e209df823d2cb5780dbb1ea12fc6d4a
# Date 2013-10-04 12:30:09 +0200
# Author Yang Zhang <yang.z.zhang@Intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
Nested VMX: fix IA32_VMX_CR4_FIXED1 msr emulation

Currently, it uses a hardcoded value for IA32_VMX_CR4_FIXED1. This is wrong.
We should check the guest's cpuid to know which bits are writeable in CR4 by
the guest and allow the guest to set the corresponding bit only when the
guest has the feature.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>

Cleanup.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Jun Nakajima <jun.nakajima@intel.com>

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1814,7 +1814,7 @@ int nvmx_handle_invvpid(struct cpu_user_
int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
{
struct vcpu *v = current;
- unsigned int ecx, dummy;
+ unsigned int eax, ebx, ecx, edx, dummy;
u64 data = 0, host_data = 0;
int r = 1;

@@ -1822,7 +1822,7 @@ int nvmx_msr_read_intercept(unsigned int
return 0;

/* VMX capablity MSRs are available only when guest supports VMX. */
- hvm_cpuid(0x1, &dummy, &dummy, &ecx, &dummy);
+ hvm_cpuid(0x1, &dummy, &dummy, &ecx, &edx);
if ( !(ecx & cpufeat_mask(X86_FEATURE_VMXE)) )
return 0;

@@ -1946,8 +1946,55 @@ int nvmx_msr_read_intercept(unsigned int
data = X86_CR4_VMXE;
break;
case MSR_IA32_VMX_CR4_FIXED1:
- /* allow 0-settings except SMXE */
- data = 0x267ff & ~X86_CR4_SMXE;
+ if ( edx & cpufeat_mask(X86_FEATURE_VME) )
+ data |= X86_CR4_VME | X86_CR4_PVI;
+ if ( edx & cpufeat_mask(X86_FEATURE_TSC) )
+ data |= X86_CR4_TSD;
+ if ( edx & cpufeat_mask(X86_FEATURE_DE) )
+ data |= X86_CR4_DE;
+ if ( edx & cpufeat_mask(X86_FEATURE_PSE) )
+ data |= X86_CR4_PSE;
+ if ( edx & cpufeat_mask(X86_FEATURE_PAE) )
+ data |= X86_CR4_PAE;
+ if ( edx & cpufeat_mask(X86_FEATURE_MCE) )
+ data |= X86_CR4_MCE;
+ if ( edx & cpufeat_mask(X86_FEATURE_PGE) )
+ data |= X86_CR4_PGE;
+ if ( edx & cpufeat_mask(X86_FEATURE_FXSR) )
+ data |= X86_CR4_OSFXSR;
+ if ( edx & cpufeat_mask(X86_FEATURE_XMM) )
+ data |= X86_CR4_OSXMMEXCPT;
+ if ( ecx & cpufeat_mask(X86_FEATURE_VMXE) )
+ data |= X86_CR4_VMXE;
+ if ( ecx & cpufeat_mask(X86_FEATURE_SMXE) )
+ data |= X86_CR4_SMXE;
+ if ( ecx & cpufeat_mask(X86_FEATURE_PCID) )
+ data |= X86_CR4_PCIDE;
+ if ( ecx & cpufeat_mask(X86_FEATURE_XSAVE) )
+ data |= X86_CR4_OSXSAVE;
+
+ hvm_cpuid(0x0, &eax, &dummy, &dummy, &dummy);
+ switch ( eax )
+ {
+ default:
+ hvm_cpuid(0xa, &eax, &dummy, &dummy, &dummy);
+ /* Check whether guest has the perf monitor feature. */
+ if ( (eax & 0xff) && (eax & 0xff00) )
+ data |= X86_CR4_PCE;
+ /* fall through */
+ case 0x7 ... 0x9:
+ ecx = 0;
+ hvm_cpuid(0x7, &dummy, &ebx, &ecx, &dummy);
+ if ( ebx & cpufeat_mask(X86_FEATURE_FSGSBASE) )
+ data |= X86_CR4_FSGSBASE;
+ if ( ebx & cpufeat_mask(X86_FEATURE_SMEP) )
+ data |= X86_CR4_SMEP;
+ if ( ebx & cpufeat_mask(X86_FEATURE_SMAP) )
+ data |= X86_CR4_SMAP;
+ /* fall through */
+ case 0x0 ... 0x6:
+ break;
+ }
break;
case MSR_IA32_VMX_MISC:
/* Do not support CR3-target feature now */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -148,6 +148,7 @@
#define X86_FEATURE_INVPCID (7*32+10) /* Invalidate Process Context ID */
#define X86_FEATURE_RTM (7*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_NO_FPU_SEL (7*32+13) /* FPU CS/DS stored as zero */
+#define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */

#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -87,6 +87,7 @@
#define X86_CR4_PCIDE 0x20000 /* enable PCID */
#define X86_CR4_OSXSAVE 0x40000 /* enable XSAVE/XRSTOR */
#define X86_CR4_SMEP 0x100000/* enable SMEP */
+#define X86_CR4_SMAP 0x200000/* enable SMAP */

/*
* Trap/fault mnemonics.
@ -1,28 +0,0 @@
# Commit 65ba631bcb62c79eb33ebfde8a0471fd012c37a8
# Date 2013-10-04 12:51:44 +0200
# Author Daniel De Graaf <dgdegra@tycho.nsa.gov>
# Committer Jan Beulich <jbeulich@suse.com>
xsm: forbid PV guest console reads

The CONSOLEIO_read operation was incorrectly allowed to PV guests if the
hypervisor was compiled in debug mode (with VERBOSE defined).

Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>

--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -222,10 +222,10 @@ static XSM_INLINE int xsm_console_io(XSM
{
XSM_ASSERT_ACTION(XSM_OTHER);
#ifdef VERBOSE
- return xsm_default_action(XSM_HOOK, current->domain, NULL);
-#else
- return xsm_default_action(XSM_PRIV, current->domain, NULL);
+ if ( cmd == CONSOLEIO_write )
+ return xsm_default_action(XSM_HOOK, d, NULL);
#endif
+ return xsm_default_action(XSM_PRIV, d, NULL);
}

static XSM_INLINE int xsm_profile(XSM_DEFAULT_ARG struct domain *d, int op)
@ -1,43 +0,0 @@
References: bnc#842511 CVE-2013-4368 XSA-67

# Commit 0771faba163769089c9f05f7f76b63e397677613
# Date 2013-10-10 15:19:53 +0200
# Author Matthew Daley <mattjd@gmail.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: check segment descriptor read result in 64-bit OUTS emulation

When emulating such an operation from a 64-bit context (CS has long
mode set), and the data segment is overridden to FS/GS, the result of
reading the overridden segment's descriptor (read_descriptor) is not
checked. If it fails, data_base is left uninitialized.

This can lead to 8 bytes of Xen's stack being leaked to the guest
(implicitly, i.e. via the address given in a #PF).

Coverity-ID: 1055116

This is CVE-2013-4368 / XSA-67.

Signed-off-by: Matthew Daley <mattjd@gmail.com>

Fix formatting.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1990,10 +1990,10 @@ static int emulate_privileged_op(struct
break;
}
}
- else
- read_descriptor(data_sel, v, regs,
- &data_base, &data_limit, &ar,
- 0);
+ else if ( !read_descriptor(data_sel, v, regs,
+ &data_base, &data_limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) )
+ goto fail;
data_limit = ~0UL;
ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
}
@ -1,71 +0,0 @@
References: bnc#842512 CVE-2013-4369 XSA-68

# Commit c53702cee1d6f9f1b72f0cae0b412e21bcda8724
# Date 2013-10-10 15:48:55 +0100
# Author Ian Jackson <ian.jackson@eu.citrix.com>
# Committer Ian Jackson <Ian.Jackson@eu.citrix.com>
libxl: fix vif rate parsing

strtok can return NULL here. We don't need to use strtok anyway, so just
use a simple strchr method.

Coverity-ID: 1055642

This is CVE-2013-4369 / XSA-68

Signed-off-by: Matthew Daley <mattjd@gmail.com>

Fix type. Add test case

Signed-off-by: Ian Campbell <Ian.campbell@citrix.com>

--- a/tools/libxl/check-xl-vif-parse
+++ b/tools/libxl/check-xl-vif-parse
@@ -206,4 +206,8 @@ expected </dev/null
one $e rate=4294967295GB/s@5us
one $e rate=4296MB/s@4294s

+# test include of single '@'
+expected </dev/null
+one $e rate=@
+
complete
--- a/tools/libxl/libxlu_vif.c
+++ b/tools/libxl/libxlu_vif.c
@@ -95,23 +95,30 @@ int xlu_vif_parse_rate(XLU_Config *cfg,
uint64_t bytes_per_sec = 0;
uint64_t bytes_per_interval = 0;
uint32_t interval_usecs = 50000UL; /* Default to 50ms */
- char *ratetok, *tmprate;
+ char *p, *tmprate;
int rc = 0;

tmprate = strdup(rate);
+ if (tmprate == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+
+ p = strchr(tmprate, '@');
+ if (p != NULL)
+ *p++ = 0;
+
if (!strcmp(tmprate,"")) {
xlu__vif_err(cfg, "no rate specified", rate);
rc = EINVAL;
goto out;
}

- ratetok = strtok(tmprate, "@");
- rc = vif_parse_rate_bytes_per_sec(cfg, ratetok, &bytes_per_sec);
+ rc = vif_parse_rate_bytes_per_sec(cfg, tmprate, &bytes_per_sec);
if (rc) goto out;

- ratetok = strtok(NULL, "@");
- if (ratetok != NULL) {
- rc = vif_parse_rate_interval_usecs(cfg, ratetok, &interval_usecs);
+ if (p != NULL) {
+ rc = vif_parse_rate_interval_usecs(cfg, p, &interval_usecs);
if (rc) goto out;
}

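
An illustrative aside, not part of the patch series: a minimal standalone C sketch of the strchr()-based split used above. parse_rate() here is a made-up name for the example; real unit parsing and overflow checking remain in libxlu_vif.c.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Sketch of splitting "RATE@INTERVAL" in place with strchr(). */
static int parse_rate(const char *rate)
{
    char *tmprate = strdup(rate);
    char *p;

    if (tmprate == NULL)
        return ENOMEM;

    p = strchr(tmprate, '@');
    if (p != NULL)
        *p++ = 0;     /* terminate the rate half, point p at the interval */

    /* Unlike the strtok version, an input of just "@" now yields an empty
     * rate string instead of a NULL pointer being passed onwards. */
    printf("rate='%s' interval='%s'\n", tmprate, p ? p : "(default)");
    free(tmprate);
    return 0;
}

int main(void)
{
    parse_rate("4MB/s@50ms");
    parse_rate("@");          /* the XSA-68 crash case */
    return 0;
}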
@ -1,28 +0,0 @@
References: bnc#842513 CVE-2013-4370 XSA-69

# Commit 3cd10fd21220f2b814324e6e732004f8f0487d0a
# Date 2013-10-10 15:49:40 +0100
# Author Matthew Daley <mattjd@gmail.com>
# Committer Ian Jackson <Ian.Jackson@eu.citrix.com>
tools/ocaml: fix erroneous free of cpumap in stub_xc_vcpu_getaffinity

Not sure how it got there...

Coverity-ID: 1056196

This is CVE-2013-4370 / XSA-69

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/tools/ocaml/libs/xc/xenctrl_stubs.c
+++ b/tools/ocaml/libs/xc/xenctrl_stubs.c
@@ -461,8 +461,6 @@ CAMLprim value stub_xc_vcpu_getaffinity(

retval = xc_vcpu_getaffinity(_H(xch), _D(domid),
Int_val(vcpu), c_cpumap);
- free(c_cpumap);
-
if (retval < 0) {
free(c_cpumap);
failwith_xc(_H(xch));
@ -1,28 +0,0 @@
References: bnc#842514 CVE-2013-4371 XSA-70

# Commit 4c37ed562224295c0f8b00211287d57cae629782
# Date 2013-10-10 15:49:54 +0100
# Author Matthew Daley <mattjd@gmail.com>
# Committer Ian Jackson <Ian.Jackson@eu.citrix.com>
libxl: fix out-of-memory error handling in libxl_list_cpupool

...otherwise it will return freed memory. All the current users of this
function check already for a NULL return, so use that.

Coverity-ID: 1056194

This is CVE-2013-4371 / XSA-70

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -649,6 +649,7 @@ libxl_cpupoolinfo * libxl_list_cpupool(l
if (!tmp) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "allocating cpupool info");
libxl_cpupoolinfo_list_free(ptr, i);
+ ptr = NULL;
goto out;
}
ptr = tmp;
@ -1,176 +0,0 @@
# Commit 40d66baa46ca8a9ffa6df3e063a967d08ec92bcf
# Date 2013-10-11 09:28:26 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: correct LDT checks

- MMUEXT_SET_LDT should behave as similarly to the LLDT instruction as
possible: fail only if the base address is non-canonical
- instead LDT descriptor accesses should fault if the descriptor
address ends up being non-canonical (by ensuring this we at once
avoid reading an entry from the mach-to-phys table and consider it a
page table entry)
- fault propagation on using LDT selectors must distinguish #PF and #GP
(the latter must be raised for a non-canonical descriptor address,
which also applies to several other uses of propagate_page_fault(),
and hence the problem is being fixed there)
- map_ldt_shadow_page() should properly wrap addresses for 32-bit VMs

At once remove the odd invocation of map_ldt_shadow_page() from the
MMUEXT_SET_LDT handler: There's nothing really telling us that the
first LDT page is going to be preferred over others.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -674,12 +674,7 @@ int arch_set_info_guest(
fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
}

- /* LDT safety checks. */
- if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
- (c.nat->ldt_ents > 8192) ||
- !array_access_ok(c.nat->ldt_base,
- c.nat->ldt_ents,
- LDT_ENTRY_SIZE) )
+ if ( !__addr_ok(c.nat->ldt_base) )
return -EINVAL;
}
else
@@ -692,15 +687,12 @@ int arch_set_info_guest(

for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
-
- /* LDT safety checks. */
- if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
- (c.cmp->ldt_ents > 8192) ||
- !compat_array_access_ok(c.cmp->ldt_base,
- c.cmp->ldt_ents,
- LDT_ENTRY_SIZE) )
- return -EINVAL;
}
+
+ /* LDT safety checks. */
+ if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
+ (c(ldt_ents) > 8192) )
+ return -EINVAL;
}

v->fpu_initialised = !!(flags & VGCF_I387_VALID);
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -582,6 +582,8 @@ int map_ldt_shadow_page(unsigned int off

BUG_ON(unlikely(in_irq()));

+ if ( is_pv_32bit_domain(d) )
+ gva = (u32)gva;
guest_get_eff_kern_l1e(v, gva, &l1e);
if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
return 0;
@@ -3229,9 +3231,8 @@ long do_mmuext_op(
MEM_LOG("ignoring SET_LDT hypercall from external domain");
okay = 0;
}
- else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
- (ents > 8192) ||
- !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
+ else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
+ (ents > 8192) )
{
okay = 0;
MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
@@ -3244,8 +3245,6 @@ long do_mmuext_op(
curr->arch.pv_vcpu.ldt_base = ptr;
curr->arch.pv_vcpu.ldt_ents = ents;
load_LDT(curr);
- if ( ents != 0 )
- (void)map_ldt_shadow_page(0);
}
break;
}
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1070,12 +1070,24 @@ static void reserved_bit_page_fault(
show_execution_state(regs);
}

-void propagate_page_fault(unsigned long addr, u16 error_code)
+struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code)
{
struct trap_info *ti;
struct vcpu *v = current;
struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;

+ if ( unlikely(!is_canonical_address(addr)) )
+ {
+ ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_gp_fault];
+ tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
+ tb->error_code = 0;
+ tb->cs = ti->cs;
+ tb->eip = ti->address;
+ if ( TI_GET_IF(ti) )
+ tb->flags |= TBF_INTERRUPT;
+ return tb;
+ }
+
v->arch.pv_vcpu.ctrlreg[2] = addr;
arch_set_cr2(v, addr);

@@ -1102,6 +1114,8 @@ void propagate_page_fault(unsigned long

if ( unlikely(error_code & PFEC_reserved_bit) )
reserved_bit_page_fault(addr, guest_cpu_user_regs());
+
+ return NULL;
}

static int handle_gdt_ldt_mapping_fault(
@@ -1135,13 +1149,16 @@ static int handle_gdt_ldt_mapping_fault(
}
else
{
+ struct trap_bounce *tb;
+
/* In hypervisor mode? Leave it to the #PF handler to fix up. */
if ( !guest_mode(regs) )
return 0;
- /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
- propagate_page_fault(
- curr->arch.pv_vcpu.ldt_base + offset,
- regs->error_code);
+ /* In guest mode? Propagate fault to guest, with adjusted %cr2. */
+ tb = propagate_page_fault(curr->arch.pv_vcpu.ldt_base + offset,
+ regs->error_code);
+ if ( tb )
+ tb->error_code = ((u16)offset & ~3) | 4;
}
}
else
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -555,7 +555,7 @@ int new_guest_cr3(unsigned long pfn);
void make_cr3(struct vcpu *v, unsigned long mfn);
void update_cr3(struct vcpu *v);
int vcpu_destroy_pagetables(struct vcpu *);
-void propagate_page_fault(unsigned long addr, u16 error_code);
+struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code);
void *do_page_walk(struct vcpu *v, unsigned long addr);

int __sync_local_execstate(void);
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -386,7 +386,8 @@ guest_get_eff_l1e(struct vcpu *v, unsign
if ( likely(!paging_mode_translate(v->domain)) )
{
ASSERT(!paging_mode_external(v->domain));
- if ( __copy_from_user(eff_l1e,
+ if ( !__addr_ok(addr) ||
+ __copy_from_user(eff_l1e,
&__linear_l1_table[l1_linear_offset(addr)],
sizeof(l1_pgentry_t)) != 0 )
*(l1_pgentry_t *)eff_l1e = l1e_empty();
@ -1,26 +0,0 @@
# Commit d06a0d715ec1423b6c42141ab1b0ff69a3effb56
# Date 2013-10-11 09:29:43 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: add address validity check to guest_map_l1e()

Just like for guest_get_eff_l1e() this prevents accessing as page
tables (and with the wrong memory attribute) internal data inside Xen
happening to be mapped with 1Gb pages.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -360,7 +360,8 @@ guest_map_l1e(struct vcpu *v, unsigned l
return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);

/* Find this l1e and its enclosing l1mfn in the linear map */
- if ( __copy_from_user(&l2e,
+ if ( !__addr_ok(addr) ||
+ __copy_from_user(&l2e,
&__linear_l2_table[l2_linear_offset(addr)],
sizeof(l2_pgentry_t)) != 0 )
return NULL;
@ -1,38 +0,0 @@
# Commit 6fd9b0361e2eb5a7f12bdd5cbf7e42c0d1937d26
# Date 2013-10-11 09:31:16 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: check for canonical address before doing page walks

... as there doesn't really exist any valid mapping for them.

Particularly in the case of do_page_walk() this also avoids returning
non-NULL for such invalid input.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -135,7 +135,7 @@ void *do_page_walk(struct vcpu *v, unsig
l2_pgentry_t l2e, *l2t;
l1_pgentry_t l1e, *l1t;

- if ( is_hvm_vcpu(v) )
+ if ( is_hvm_vcpu(v) || !is_canonical_address(addr) )
return NULL;

l4t = map_domain_page(mfn);
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -169,6 +169,8 @@ void show_page_walk(unsigned long addr)
l1_pgentry_t l1e, *l1t;

printk("Pagetable walk from %016lx:\n", addr);
+ if ( !is_canonical_address(addr) )
+ return;

l4t = map_domain_page(mfn);
l4e = l4t[l4_table_offset(addr)];
@ -0,0 +1,46 @@
# Commit e47a90e6dca491c0ceea6ffa18055e7e32565e8e
# Date 2013-10-21 17:26:16 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/xsave: also save/restore XCR0 across suspend (ACPI S3)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/acpi/suspend.c
+++ b/xen/arch/x86/acpi/suspend.c
@@ -13,12 +13,14 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/i387.h>
+#include <asm/xstate.h>
 #include <xen/hypercall.h>

 static unsigned long saved_lstar, saved_cstar;
 static unsigned long saved_sysenter_esp, saved_sysenter_eip;
 static unsigned long saved_fs_base, saved_gs_base, saved_kernel_gs_base;
 static uint16_t saved_segs[4];
+static uint64_t saved_xcr0;

 void save_rest_processor_state(void)
 {
@@ -38,6 +40,8 @@ void save_rest_processor_state(void)
         rdmsrl(MSR_IA32_SYSENTER_ESP, saved_sysenter_esp);
         rdmsrl(MSR_IA32_SYSENTER_EIP, saved_sysenter_eip);
     }
+    if ( cpu_has_xsave )
+        saved_xcr0 = get_xcr0();
 }


@@ -77,6 +81,9 @@ void restore_rest_processor_state(void)
         do_set_segment_base(SEGBASE_GS_USER_SEL, saved_segs[3]);
     }

+    if ( cpu_has_xsave && !set_xcr0(saved_xcr0) )
+        BUG();
+
     /* Maybe load the debug registers. */
     BUG_ON(is_hvm_vcpu(curr));
     if ( !is_idle_vcpu(curr) && curr->arch.debugreg[7] )
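
As an illustrative aside (not one of the patches): the suspend/resume pairing above in plain C. All names are stand-ins (saved_state for saved_xcr0, set_state() for set_xcr0()); the point is that resume restores exactly what suspend captured and treats a failed restore as fatal, like the BUG() in the hunk.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t saved_state;            /* analogue of saved_xcr0 */
static uint64_t current_state = 7;      /* analogue of the XCR0 register */

static int set_state(uint64_t v) { current_state = v; return 1; }

static void save_rest(void) { saved_state = current_state; }

static void restore_rest(void)
{
    /* mirror the patch: a failed restore is fatal, not a warning */
    if (!set_state(saved_state))
        abort();
}

int main(void)
{
    save_rest();
    current_state = 0;                  /* "suspend" clobbers the state */
    restore_rest();
    printf("restored: %llu\n", (unsigned long long)current_state);
    return 0;
}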
@ -0,0 +1,62 @@
# Commit 343cad8c70585c4dba8afc75e1ec1b7610605ab2
# Date 2013-10-28 12:00:36 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: refine address validity checks before accessing page tables

In commit 40d66baa ("x86: correct LDT checks") and d06a0d71 ("x86: add
address validity check to guest_map_l1e()") I didn't really pay
attention to the fact that these checks would better be done before the
paging_mode_translate() ones, as there's also no equivalent check down
the shadow code paths involved here (at least not up to the first use
of the address), and such generic checks shouldn't really be done by
particular backend functions anyway.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>

--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -356,12 +356,14 @@ guest_map_l1e(struct vcpu *v, unsigned l
 {
     l2_pgentry_t l2e;

+    if ( unlikely(!__addr_ok(addr)) )
+        return NULL;
+
     if ( unlikely(paging_mode_translate(v->domain)) )
         return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);

     /* Find this l1e and its enclosing l1mfn in the linear map */
-    if ( !__addr_ok(addr) ||
-         __copy_from_user(&l2e,
+    if ( __copy_from_user(&l2e,
                           &__linear_l2_table[l2_linear_offset(addr)],
                           sizeof(l2_pgentry_t)) != 0 )
         return NULL;
@@ -382,16 +384,21 @@ guest_unmap_l1e(struct vcpu *v, void *p)

 /* Read the guest's l1e that maps this address. */
 static inline void
-guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
+guest_get_eff_l1e(struct vcpu *v, unsigned long addr, l1_pgentry_t *eff_l1e)
 {
+    if ( unlikely(!__addr_ok(addr)) )
+    {
+        *eff_l1e = l1e_empty();
+        return;
+    }
+
     if ( likely(!paging_mode_translate(v->domain)) )
     {
         ASSERT(!paging_mode_external(v->domain));
-        if ( !__addr_ok(addr) ||
-             __copy_from_user(eff_l1e,
+        if ( __copy_from_user(eff_l1e,
                               &__linear_l1_table[l1_linear_offset(addr)],
                               sizeof(l1_pgentry_t)) != 0 )
-            *(l1_pgentry_t *)eff_l1e = l1e_empty();
+            *eff_l1e = l1e_empty();
         return;
     }
526f786a-fix-locking-in-cpu_disable_scheduler.patch (new file, 74 lines)
@ -0,0 +1,74 @@
# Commit 41a0cc9e26160a89245c9ba3233e3f70bf9cd4b4
# Date 2013-10-29 09:57:14 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
fix locking in cpu_disable_scheduler()

So commit eedd6039 ("scheduler: adjust internal locking interface")
uncovered - by now using proper spin lock constructs - a bug after all:
When bringing down a CPU, cpu_disable_scheduler() gets called with
interrupts disabled, and hence the use of vcpu_schedule_lock_irq() was
never really correct (i.e. the caller ended up with interrupts enabled
despite having disabled them explicitly).

Fixing this however surfaced another problem: The call path
vcpu_migrate() -> evtchn_move_pirqs() wants to acquire the event lock,
which however is a non-IRQ-safe one, and hence check_lock() doesn't
like this lock to be acquired when interrupts are already off. As we're
in stop-machine context here, getting things wrong wrt interrupt state
management during lock acquire/release is out of the question though, so
the simple solution to this appears to be to just suppress spin lock
debugging for the period of time while the stop machine callback gets
run.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -600,7 +600,8 @@ int cpu_disable_scheduler(unsigned int c
     {
         for_each_vcpu ( d, v )
         {
-            spinlock_t *lock = vcpu_schedule_lock_irq(v);
+            unsigned long flags;
+            spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);

             cpumask_and(&online_affinity, v->cpu_affinity, c->cpu_valid);
             if ( cpumask_empty(&online_affinity) &&
@@ -621,14 +622,12 @@ int cpu_disable_scheduler(unsigned int c
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
-                vcpu_schedule_unlock_irq(lock, v);
+                vcpu_schedule_unlock_irqrestore(lock, flags, v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
             else
-            {
-                vcpu_schedule_unlock_irq(lock, v);
-            }
+                vcpu_schedule_unlock_irqrestore(lock, flags, v);

             /*
              * A vcpu active in the hypervisor will not be migratable.
--- a/xen/common/stop_machine.c
+++ b/xen/common/stop_machine.c
@@ -110,6 +110,7 @@ int stop_machine_run(int (*fn)(void *),
     local_irq_disable();
     stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
     stopmachine_wait_state();
+    spin_debug_disable();

     stopmachine_set_state(STOPMACHINE_INVOKE);
     if ( (cpu == smp_processor_id()) || (cpu == NR_CPUS) )
@@ -117,6 +118,7 @@ int stop_machine_run(int (*fn)(void *),
     stopmachine_wait_state();
     ret = stopmachine_data.fn_result;

+    spin_debug_enable();
     stopmachine_set_state(STOPMACHINE_EXIT);
     stopmachine_wait_state();
     local_irq_enable();
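
For illustration, the locking discipline the fix switches to, with mocked primitives rather than Xen's: an _irqsave lock remembers the caller's interrupt state, and the matching unlock restores that saved state instead of unconditionally re-enabling interrupts.

#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled = false;   /* the caller already disabled IRQs */

static unsigned long local_irq_save(void)
{
    unsigned long flags = irqs_enabled;
    irqs_enabled = false;
    return flags;
}

static void local_irq_restore(unsigned long flags) { irqs_enabled = flags; }

static void lock_irqsave(unsigned long *flags)  { *flags = local_irq_save(); /* acquire lock */ }
static void unlock_irqrestore(unsigned long flags) { /* release lock */ local_irq_restore(flags); }

int main(void)
{
    unsigned long flags;

    lock_irqsave(&flags);
    /* critical section */
    unlock_irqrestore(flags);
    /* An _irq-style unlock would have forced irqs_enabled = true here,
     * re-enabling interrupts behind the caller's back. */
    printf("irqs enabled afterwards: %d (expected 0)\n", irqs_enabled);
    return 0;
}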
5277639c-gnttab-correct-locking-order-reversal.patch (new file, 101 lines)
@ -0,0 +1,101 @@
References: bnc#848657 CVE-2013-4494 XSA-73

# HG changeset patch
# User Andrew Cooper <andrew.cooper3@citrix.com>
# Date 1383556439 -3600
# Node ID f63cb4c06a991a69b0f11789c88ef069eb39f64c
# Parent c30539bc5b235c9ce657f483c2305212ad1cdfba
gnttab: correct locking order reversal

Coverity ID 1087189

Correct a lock order reversal between a domain's page allocation and grant
table locks.

This is CVE-2013-4494 / XSA-73.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Consolidate error handling.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Keir Fraser <keir@xen.org>
Tested-by: Matthew Daley <mattjd@gmail.com>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -1518,6 +1518,8 @@ gnttab_transfer(

     for ( i = 0; i < count; i++ )
     {
+        bool_t okay;
+
         if (i && hypercall_preempt_check())
             return i;

@@ -1626,16 +1628,18 @@ gnttab_transfer(
          * pages when it is dying.
          */
         if ( unlikely(e->is_dying) ||
-             unlikely(e->tot_pages >= e->max_pages) ||
-             unlikely(!gnttab_prepare_for_transfer(e, d, gop.ref)) )
+             unlikely(e->tot_pages >= e->max_pages) )
         {
-            if ( !e->is_dying )
-                gdprintk(XENLOG_INFO, "gnttab_transfer: "
-                         "Transferee has no reservation "
-                         "headroom (%d,%d) or provided a bad grant ref (%08x) "
-                         "or is dying (%d)\n",
-                         e->tot_pages, e->max_pages, gop.ref, e->is_dying);
             spin_unlock(&e->page_alloc_lock);
+
+            if ( e->is_dying )
+                gdprintk(XENLOG_INFO, "gnttab_transfer: "
+                         "Transferee (d%d) is dying\n", e->domain_id);
+            else
+                gdprintk(XENLOG_INFO, "gnttab_transfer: "
+                         "Transferee (d%d) has no headroom (tot %u, max %u)\n",
+                         e->domain_id, e->tot_pages, e->max_pages);
+
             rcu_unlock_domain(e);
             put_gfn(d, gop.mfn);
             page->count_info &= ~(PGC_count_mask|PGC_allocated);
@@ -1647,6 +1651,38 @@ gnttab_transfer(
         /* Okay, add the page to 'e'. */
         if ( unlikely(domain_adjust_tot_pages(e, 1) == 1) )
             get_knownalive_domain(e);
+
+        /*
+         * We must drop the lock to avoid a possible deadlock in
+         * gnttab_prepare_for_transfer. We have reserved a page in e so can
+         * safely drop the lock and re-aquire it later to add page to the
+         * pagelist.
+         */
+        spin_unlock(&e->page_alloc_lock);
+        okay = gnttab_prepare_for_transfer(e, d, gop.ref);
+        spin_lock(&e->page_alloc_lock);
+
+        if ( unlikely(!okay) || unlikely(e->is_dying) )
+        {
+            bool_t drop_dom_ref = !domain_adjust_tot_pages(e, -1);
+
+            spin_unlock(&e->page_alloc_lock);
+
+            if ( okay /* i.e. e->is_dying due to the surrounding if() */ )
+                gdprintk(XENLOG_INFO, "gnttab_transfer: "
+                         "Transferee (d%d) is now dying\n", e->domain_id);
+
+            if ( drop_dom_ref )
+                put_domain(e);
+            rcu_unlock_domain(e);
+
+            put_gfn(d, gop.mfn);
+            page->count_info &= ~(PGC_count_mask|PGC_allocated);
+            free_domheap_page(page);
+            gop.status = GNTST_general_error;
+            goto copyback;
+        }
+
         page_list_add_tail(page, &e->page_list);
         page_set_owner(page, e);
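
The XSA-73 shape in miniature, as a user-space pthreads sketch with mock types (nothing here is Xen's): reserve under lock A, drop it before taking the other lock, then re-take A and revalidate before committing, undoing the reservation on failure.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct dom {
    pthread_mutex_t alloc_lock;
    int tot_pages;
    bool dying;
};

/* stands in for gnttab_prepare_for_transfer(): takes other locks */
static bool prepare(struct dom *e) { (void)e; return true; }

static bool transfer_one(struct dom *e)
{
    bool ok;

    pthread_mutex_lock(&e->alloc_lock);
    e->tot_pages++;                       /* reserve under the lock */
    pthread_mutex_unlock(&e->alloc_lock); /* drop before the 2nd lock */

    ok = prepare(e);                      /* no alloc_lock held here */

    pthread_mutex_lock(&e->alloc_lock);   /* re-take and revalidate */
    if (!ok || e->dying) {
        e->tot_pages--;                   /* undo the reservation */
        pthread_mutex_unlock(&e->alloc_lock);
        return false;
    }
    /* commit: add the page to e's page list here */
    pthread_mutex_unlock(&e->alloc_lock);
    return true;
}

int main(void)
{
    struct dom d = { PTHREAD_MUTEX_INITIALIZER, 0, false };
    printf("transfer %s\n", transfer_one(&d) ? "succeeded" : "failed");
    return 0;
}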
@ -0,0 +1,39 @@
References: bnc#848014

# Commit 2c24cdcce3269f3286790c63821951a1de93c66a
# Date 2013-11-04 10:10:04 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/ACPI/x2APIC: guard against out of range ACPI or APIC IDs

Other than for the legacy APIC, the x2APIC MADT entries have valid
ranges possibly extending beyond what our internal arrays can handle,
and hence we need to guard ourselves against corrupting memory here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -97,7 +97,20 @@ acpi_parse_x2apic(struct acpi_subtable_h

	acpi_table_print_madt_entry(header);

-	/* Record local apic id only when enabled */
+	/* Record local apic id only when enabled and fitting. */
+	if (processor->local_apic_id >= MAX_APICS ||
+	    processor->uid >= MAX_MADT_ENTRIES) {
+		printk("%sAPIC ID %#x and/or ACPI ID %#x beyond limit"
+		       " - processor ignored\n",
+		       processor->lapic_flags & ACPI_MADT_ENABLED ?
+				KERN_WARNING "WARNING: " : KERN_INFO,
+		       processor->local_apic_id, processor->uid);
+		/*
+		 * Must not return an error here, to prevent
+		 * acpi_table_parse_entries() from terminating early.
+		 */
+		return 0 /* -ENOSPC */;
+	}
	if (processor->lapic_flags & ACPI_MADT_ENABLED) {
		x86_acpiid_to_apicid[processor->uid] =
			processor->local_apic_id;
@ -0,0 +1,27 @@
References: bnc#842417

# Commit 227258983401b7e6091967ffaf22ad83f4ebaf6f
# Date 2013-11-04 14:29:24 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: make sure memory block is RAM before passing to the allocator

Memory blocks outside of the always visible 1:1 mapping range get
passed to the allocator separately (once enough other setup was done).
Skipping non-RAM regions, however, was forgotten in adc5afbf ("x86:
support up to 16Tb").

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1154,6 +1154,8 @@ void __init __start_xen(unsigned long mb
     {
         uint64_t s, e;

+        if ( boot_e820.map[i].type != E820_RAM )
+            continue;
         s = (boot_e820.map[i].addr + mask) & ~mask;
         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
         if ( PFN_DOWN(e) <= limit )
@ -0,0 +1,82 @@
# Commit 9d89100ba8b7b02adb7c2e89ef7c81e734942e7c
# Date 2013-11-05 14:51:53 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: 32-bit IN result must be zero-extended to 64 bits

Just like for all other operations with 32-bit operand size.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

# Commit 1e521eddeb51a9f1bf0e4dd1d17efc873eafae41
# Date 2013-11-15 11:01:49 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: 32-bit IN result must be zero-extended to 64 bits (part 2)

Just spotted a counterpart of what commit 9d89100b (same title) dealt
with.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -221,13 +221,15 @@ int handle_mmio_with_translation(unsigne
     return handle_mmio();
 }

-int handle_pio(uint16_t port, int size, int dir)
+int handle_pio(uint16_t port, unsigned int size, int dir)
 {
     struct vcpu *curr = current;
     struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
     unsigned long data, reps = 1;
     int rc;

+    ASSERT((size - 1) < 4 && size != 3);
+
     if ( dir == IOREQ_WRITE )
         data = guest_cpu_user_regs()->eax;

@@ -237,7 +239,12 @@ int handle_pio(uint16_t port, int size,
     {
     case X86EMUL_OKAY:
         if ( dir == IOREQ_READ )
-            memcpy(&guest_cpu_user_regs()->eax, &data, vio->io_size);
+        {
+            if ( size == 4 ) /* Needs zero extension. */
+                guest_cpu_user_regs()->rax = (uint32_t)data;
+            else
+                memcpy(&guest_cpu_user_regs()->rax, &data, size);
+        }
         break;
     case X86EMUL_RETRY:
         if ( vio->io_state != HVMIO_awaiting_completion )
@@ -281,8 +288,10 @@ void hvm_io_assist(void)
         (void)handle_mmio();
         break;
     case HVMIO_handle_pio_awaiting_completion:
-        memcpy(&guest_cpu_user_regs()->eax,
-               &p->data, vio->io_size);
+        if ( vio->io_size == 4 ) /* Needs zero extension. */
+            guest_cpu_user_regs()->rax = (uint32_t)p->data;
+        else
+            memcpy(&guest_cpu_user_regs()->rax, &p->data, vio->io_size);
         break;
     default:
         break;
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -119,7 +119,7 @@ void send_timeoffset_req(unsigned long t
 void send_invalidate_req(void);
 int handle_mmio(void);
 int handle_mmio_with_translation(unsigned long gva, unsigned long gpfn);
-int handle_pio(uint16_t port, int size, int dir);
+int handle_pio(uint16_t port, unsigned int size, int dir);
 void hvm_interrupt_post(struct vcpu *v, int vector, int type);
 void hvm_io_assist(void);
 void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq,
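
Why operand size 4 is special, as a standalone sketch: on x86-64 a 32-bit register write zero-extends into bits 63:32, while 8- and 16-bit writes merge into the low bits only, and an emulator must reproduce that. The memcpy branch assumes little-endian layout, as on x86.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void write_result(uint64_t *rax, uint64_t data, unsigned int size)
{
    if (size == 4)
        *rax = (uint32_t)data;    /* zero-extend, clobbering bits 63:32 */
    else
        memcpy(rax, &data, size); /* 1/2 bytes: merge into low bits only */
}

int main(void)
{
    uint64_t rax = 0xffffffffffffffffULL;

    write_result(&rax, 0x12345678, 4);
    printf("%#llx\n", (unsigned long long)rax); /* 0x12345678 */
    rax = 0xffffffffffffffffULL;
    write_result(&rax, 0x5678, 2);
    printf("%#llx\n", (unsigned long long)rax); /* 0xffffffffffff5678 */
    return 0;
}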
@ -0,0 +1,38 @@
# Commit 117f67350fd18b11ab09d628b4edea3364b09441
# Date 2013-11-06 10:21:09 +0100
# Author Nathan Studer <nate.studer@dornerworks.com>
# Committer Jan Beulich <jbeulich@suse.com>
call sched_destroy_domain before cpupool_rm_domain

The domain destruction code removes a domain from its cpupool
before attempting to destroy its scheduler information. Since
the scheduler framework uses the domain's cpupool information
to decide on which scheduler ops to use, this results in
the wrong scheduler's destroy domain function being called
when the cpupool scheduler and the initial scheduler are
different.

Correct this by destroying the domain's scheduling information
before removing it from the pool.

Signed-off-by: Nathan Studer <nate.studer@dornerworks.com>
Reviewed-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -720,10 +720,10 @@ static void complete_domain_destroy(stru

     rangeset_domain_destroy(d);

-    cpupool_rm_domain(d);
-
     sched_destroy_domain(d);

+    cpupool_rm_domain(d);
+
     /* Free page used by xen oprofile buffer. */
 #ifdef CONFIG_XENOPROF
     free_xenoprof_pages(d);
@ -0,0 +1,29 @@
# Commit 48535f5798e3e237d9920a74c1ce3802958136c0
# Date 2013-11-08 11:07:14 +0100
# Author Kouya Shimura <kouya@jp.fujitsu.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvm: fix restart of RTC periodic timer with vpt_align=1

The commit 58afa7ef "x86/hvm: Run the RTC periodic timer on a
consistent time series" aligns the RTC periodic timer to the VM's boot time.
However, it's aligned later again to the system time in create_periodic_time()
with vpt_align=1. The next tick might be skipped.

Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>

--- a/xen/arch/x86/hvm/rtc.c
+++ b/xen/arch/x86/hvm/rtc.c
@@ -130,7 +130,10 @@ static void rtc_timer_update(RTCState *s
             s->pt_code = period_code;
             period = 1 << (period_code - 1); /* period in 32 Khz cycles */
             period = DIV_ROUND(period * 1000000000ULL, 32768); /* in ns */
-            delta = period - ((NOW() - s->start_time) % period);
+            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VPT_ALIGN] )
+                delta = 0;
+            else
+                delta = period - ((NOW() - s->start_time) % period);
             create_periodic_time(v, &s->pt, delta, period,
                                  RTC_IRQ, NULL, s);
         }
527cb820-x86-EFI-make-trampoline-allocation-more-flexible.patch (new file, 107 lines)
@ -0,0 +1,107 @@
References: bnc#833483

# Commit c1f2dfe8f6a559bc28935f24e31bb33d17d9713d
# Date 2013-11-08 11:08:32 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: make trampoline allocation more flexible

Certain UEFI implementations reserve all memory below 1Mb at boot time,
making it impossible to properly allocate the chunk necessary for the
trampoline. Fall back to simply grabbing a chunk from EfiBootServices*
regions immediately prior to calling ExitBootServices().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/efi/boot.c
+++ b/xen/arch/x86/efi/boot.c
@@ -746,6 +746,22 @@ static void __init relocate_image(unsign
 extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
 extern const s32 __trampoline_seg_start[], __trampoline_seg_stop[];

+static void __init relocate_trampoline(unsigned long phys)
+{
+    const s32 *trampoline_ptr;
+
+    trampoline_phys = phys;
+    /* Apply relocations to trampoline. */
+    for ( trampoline_ptr = __trampoline_rel_start;
+          trampoline_ptr < __trampoline_rel_stop;
+          ++trampoline_ptr )
+        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
+    for ( trampoline_ptr = __trampoline_seg_start;
+          trampoline_ptr < __trampoline_seg_stop;
+          ++trampoline_ptr )
+        *(u16 *)(*trampoline_ptr + (long)trampoline_ptr) = phys >> 4;
+}
+
 void EFIAPI __init __attribute__((__noreturn__))
 efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
 {
@@ -765,7 +781,6 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
     EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *mode_info;
     EFI_FILE_HANDLE dir_handle;
     union string section = { NULL }, name;
-    const s32 *trampoline_ptr;
     struct e820entry *e;
     u64 efer;
     bool_t base_video = 0;
@@ -1268,23 +1283,13 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
     cfg.size = trampoline_end - trampoline_start;
     status = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData,
                                    PFN_UP(cfg.size), &cfg.addr);
-    if ( EFI_ERROR(status) )
+    if ( status == EFI_SUCCESS )
+        relocate_trampoline(cfg.addr);
+    else
     {
         cfg.addr = 0;
-        blexit(L"No memory for trampoline\r\n");
+        PrintStr(L"Trampoline space cannot be allocated; will try fallback.\r\n");
     }
-    trampoline_phys = cfg.addr;
-    /* Apply relocations to trampoline. */
-    for ( trampoline_ptr = __trampoline_rel_start;
-          trampoline_ptr < __trampoline_rel_stop;
-          ++trampoline_ptr )
-        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) +=
-            trampoline_phys;
-    for ( trampoline_ptr = __trampoline_seg_start;
-          trampoline_ptr < __trampoline_seg_stop;
-          ++trampoline_ptr )
-        *(u16 *)(*trampoline_ptr + (long)trampoline_ptr) =
-            trampoline_phys >> 4;

     /* Initialise L2 identity-map and boot-map page table entries (16MB). */
     for ( i = 0; i < 8; ++i )
@@ -1400,10 +1405,14 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
             type = E820_RESERVED;
             break;
         case EfiConventionalMemory:
-        case EfiLoaderCode:
-        case EfiLoaderData:
         case EfiBootServicesCode:
         case EfiBootServicesData:
+            if ( !trampoline_phys && desc->PhysicalStart + len <= 0x100000 &&
+                 len >= cfg.size && desc->PhysicalStart + len > cfg.addr )
+                cfg.addr = (desc->PhysicalStart + len - cfg.size) & PAGE_MASK;
+            /* fall through */
+        case EfiLoaderCode:
+        case EfiLoaderData:
             if ( desc->Attribute & EFI_MEMORY_WB )
                 type = E820_RAM;
             else
@@ -1431,6 +1440,12 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
             ++e820nr;
         }
     }
+    if ( !trampoline_phys )
+    {
+        if ( !cfg.addr )
+            blexit(L"No memory for trampoline");
+        relocate_trampoline(cfg.addr);
+    }

     status = efi_bs->ExitBootServices(ImageHandle, map_key);
     if ( EFI_ERROR(status) )
@ -0,0 +1,62 @@
References: bnc#849665 CVE-2013-4551 XSA-75

# Commit 4e87bc5b03e05123ba5c888f77969140c8ebd1bf
# Date 2013-11-11 09:15:04 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
nested VMX: VMLANUCH/VMRESUME emulation must check permission first thing

Otherwise uninitialized data may be used, leading to crashes.

This is CVE-2013-4551 / XSA-75.

Reported-and-tested-by: Jeff Zimmerman <Jeff_Zimmerman@McAfee.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-and-tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1508,15 +1508,10 @@ static void clear_vvmcs_launched(struct
     }
 }

-int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs)
+static int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
-    int rc;
-
-    rc = vmx_inst_check_privilege(regs, 0);
-    if ( rc != X86EMUL_OKAY )
-        return rc;

     /* check VMCS is valid and IO BITMAP is set */
     if ( (nvcpu->nv_vvmcxaddr != VMCX_EADDR) &&
@@ -1535,6 +1530,10 @@ int nvmx_handle_vmresume(struct cpu_user
     struct vcpu *v = current;
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
+    int rc = vmx_inst_check_privilege(regs, 0);
+
+    if ( rc != X86EMUL_OKAY )
+        return rc;

     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == VMCX_EADDR )
     {
@@ -1554,10 +1553,13 @@ int nvmx_handle_vmresume(struct cpu_user
 int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
 {
     bool_t launched;
-    int rc;
     struct vcpu *v = current;
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
+    int rc = vmx_inst_check_privilege(regs, 0);
+
+    if ( rc != X86EMUL_OKAY )
+        return rc;

     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == VMCX_EADDR )
     {
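
The XSA-75 ordering rule in miniature, with mock functions: the privilege check runs first thing, before any state the unprivileged caller influences is read.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool check_privilege(void) { return false; /* e.g. not in VMX operation */ }
static int  use_vmcs_state(void)  { return 0; /* touches data only valid after setup */ }

static int handle_vmlaunch(void)
{
    if (!check_privilege())   /* first thing, before reading any state */
        return -EPERM;
    return use_vmcs_state();
}

int main(void)
{
    printf("%d\n", handle_vmlaunch()); /* -EPERM: the state is never touched */
    return 0;
}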
@ -0,0 +1,101 @@
References: bnc#842417

# Commit 178fd279dc138243b514b4ecd48509e4bf5d1ede
# Date 2013-11-11 11:01:04 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/idle: reduce contention on ACPI register accesses

Other than when they're located in I/O port space, accessing them when
in MMIO space (currently) implies usage of some sort of global lock: In
-unstable this would be due to the use of vmap(), in older trees the
necessary locking was introduced by 2ee9cbf9 ("ACPI: fix
acpi_os_map_memory()"). This contention was observed to result in Dom0
kernel soft lockups during the loading of the ACPI processor driver
there on systems with very many CPU cores.

There are a couple of things being done for this:
- re-order elements of an if() condition so that the register access
  only happens when we really need it
- turn off arbitration disabling only when the first CPU leaves C3
  (paralleling how arbitration disabling gets turned on)
- only set the (global) bus master reload flag once (when the first
  target CPU gets processed)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -439,8 +439,8 @@ static void acpi_processor_idle(void)
          (next_state = cpuidle_current_governor->select(power)) > 0 )
     {
         cx = &power->states[next_state];
-        if ( power->flags.bm_check && acpi_idle_bm_check()
-             && cx->type == ACPI_STATE_C3 )
+        if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
+             acpi_idle_bm_check() )
             cx = power->safe_state;
         if ( cx->idx > max_cstate )
             cx = &power->states[max_cstate];
@@ -563,8 +563,8 @@ static void acpi_processor_idle(void)
         {
             /* Enable bus master arbitration */
             spin_lock(&c3_cpu_status.lock);
-            acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
-            c3_cpu_status.count--;
+            if ( c3_cpu_status.count-- == num_online_cpus() )
+                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
             spin_unlock(&c3_cpu_status.lock);
         }

@@ -821,12 +821,10 @@ static int check_cx(struct acpi_processo
             return -EINVAL;

         /* All the logic here assumes flags.bm_check is same across all CPUs */
-        if ( bm_check_flag == -1 )
+        if ( bm_check_flag < 0 )
         {
             /* Determine whether bm_check is needed based on CPU */
             acpi_processor_power_init_bm_check(&(power->flags));
-            bm_check_flag = power->flags.bm_check;
-            bm_control_flag = power->flags.bm_control;
         }
         else
         {
@@ -853,14 +851,13 @@ static int check_cx(struct acpi_processo
                 }
             }
             /*
-             * On older chipsets, BM_RLD needs to be set
-             * in order for Bus Master activity to wake the
-             * system from C3. Newer chipsets handle DMA
-             * during C3 automatically and BM_RLD is a NOP.
-             * In either case, the proper way to
-             * handle BM_RLD is to set it and leave it set.
+             * On older chipsets, BM_RLD needs to be set in order for Bus
+             * Master activity to wake the system from C3, hence
+             * acpi_set_register() is always being called once below. Newer
+             * chipsets handle DMA during C3 automatically and BM_RLD is a
+             * NOP. In either case, the proper way to handle BM_RLD is to
+             * set it and leave it set.
              */
-            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
         }
         else
         {
@@ -875,7 +872,13 @@ static int check_cx(struct acpi_processo
                                   " for C3 to be enabled on SMP systems\n"));
                 return -EINVAL;
             }
-            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+        }
+
+        if ( bm_check_flag < 0 )
+        {
+            bm_check_flag = power->flags.bm_check;
+            bm_control_flag = power->flags.bm_control;
+            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag);
         }

         break;
@ -0,0 +1,75 @@
# Commit 67348c3ac700b8bc9147638c719c3035c5ef20f5
# Date 2013-11-12 10:54:28 +0100
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
numa-sched: leave node-affinity alone if not in "auto" mode

If the domain's NUMA node-affinity is being specified by the
user/toolstack (instead of being automatically computed by Xen),
we really should stick to that. This means domain_update_node_affinity()
is wrong when it filters out some stuff from there even in "!auto"
mode.

This commit fixes that. Of course, this does not mean node-affinity
is always honoured (e.g., a vcpu won't run on a pcpu of a different
cpupool) but the necessary logic for taking into account all the
possible situations lives in the scheduler code, where it belongs.

What could happen without this change is that, under certain
circumstances, the node-affinity of a domain may change when the
user modifies the vcpu-affinity of the domain's vcpus. This, even
if probably not a real bug, is at least something the user does
not expect, so let's avoid it.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -345,7 +345,6 @@ void domain_update_node_affinity(struct
     cpumask_var_t cpumask;
     cpumask_var_t online_affinity;
     const cpumask_t *online;
-    nodemask_t nodemask = NODE_MASK_NONE;
     struct vcpu *v;
     unsigned int node;

@@ -367,28 +366,19 @@ void domain_update_node_affinity(struct
         cpumask_or(cpumask, cpumask, online_affinity);
     }

+    /*
+     * If d->auto_node_affinity is true, the domain's node-affinity mask
+     * (d->node_affinity) is automaically computed from all the domain's
+     * vcpus' vcpu-affinity masks (the union of which we have just built
+     * above in cpumask). OTOH, if d->auto_node_affinity is false, we
+     * must leave the node-affinity of the domain alone.
+     */
     if ( d->auto_node_affinity )
     {
-        /* Node-affinity is automaically computed from all vcpu-affinities */
+        nodes_clear(d->node_affinity);
         for_each_online_node ( node )
             if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
-                node_set(node, nodemask);
-
-        d->node_affinity = nodemask;
-    }
-    else
-    {
-        /* Node-affinity is provided by someone else, just filter out cpus
-         * that are either offline or not in the affinity of any vcpus. */
-        nodemask = d->node_affinity;
-        for_each_node_mask ( node, d->node_affinity )
-            if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) )
-                node_clear(node, nodemask);//d->node_affinity);
-
-        /* Avoid loosing track of node-affinity because of a bad
-         * vcpu-affinity has been specified. */
-        if ( !nodes_empty(nodemask) )
-            d->node_affinity = nodemask;
+                node_set(node, d->node_affinity);
     }

     sched_set_node_affinity(d, &d->node_affinity);
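
The "auto" computation kept by the patch, on toy bitmasks (the topology below is purely illustrative): a node belongs to the affinity mask iff any of its CPUs intersects the union of the vcpus' affinities.

#include <stdint.h>
#include <stdio.h>

#define NR_NODES 4
/* CPUs per node, as one 16-bit mask each (illustrative topology) */
static const uint16_t node_to_cpumask[NR_NODES] = {
    0x000f, 0x00f0, 0x0f00, 0xf000
};

static unsigned int auto_node_affinity(uint16_t vcpu_union)
{
    unsigned int nodes = 0, n;

    for (n = 0; n < NR_NODES; n++)
        if (node_to_cpumask[n] & vcpu_union)
            nodes |= 1u << n;
    return nodes;
}

int main(void)
{
    /* vcpus pinned to CPUs 2 and 9 -> nodes 0 and 2 -> mask 0x5 */
    printf("%#x\n", auto_node_affinity((1u << 2) | (1u << 9)));
    return 0;
}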
@ -0,0 +1,132 @@
# Commit b1e87805bf37b446dade93a7eb922bb7d1269756
# Date 2013-11-12 11:51:15 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
nested SVM: adjust guest handling of structure mappings

For one, nestedsvm_vmcb_map() error checking must not consist of using
assertions: Global (permanent) mappings can fail, and hence failure
needs to be dealt with properly. And non-global (transient) mappings
can't fail anyway.

And then the I/O port access bitmap handling was broken: It checked
only the first of the accessed ports rather than each of them.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Christoph Egger <chegger@amazon.de>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -342,7 +342,7 @@ static int nsvm_vmrun_permissionmap(stru
     unsigned int i;
     enum hvm_copy_result ret;
     unsigned long *ns_viomap;
-    bool_t ioport_80, ioport_ed;
+    bool_t ioport_80 = 1, ioport_ed = 1;

     ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm;

@@ -360,10 +360,12 @@ static int nsvm_vmrun_permissionmap(stru
     svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa;

     ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT, 0);
-    ASSERT(ns_viomap != NULL);
-    ioport_80 = test_bit(0x80, ns_viomap);
-    ioport_ed = test_bit(0xed, ns_viomap);
-    hvm_unmap_guest_frame(ns_viomap, 0);
+    if ( ns_viomap )
+    {
+        ioport_80 = test_bit(0x80, ns_viomap);
+        ioport_ed = test_bit(0xed, ns_viomap);
+        hvm_unmap_guest_frame(ns_viomap, 0);
+    }

     svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed);

@@ -866,40 +868,45 @@ nsvm_vmcb_guest_intercepts_msr(unsigned
 static int
 nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1)
 {
-    unsigned long iopm_gfn = iopm_pa >> PAGE_SHIFT;
-    unsigned long *io_bitmap = NULL;
+    unsigned long gfn = iopm_pa >> PAGE_SHIFT;
+    unsigned long *io_bitmap;
     ioio_info_t ioinfo;
     uint16_t port;
+    unsigned int size;
     bool_t enabled;
-    unsigned long gfn = 0; /* gcc ... */

     ioinfo.bytes = exitinfo1;
     port = ioinfo.fields.port;
+    size = ioinfo.fields.sz32 ? 4 : ioinfo.fields.sz16 ? 2 : 1;

-    switch (port) {
-    case 0 ... 32767: /* first 4KB page */
-        gfn = iopm_gfn;
+    switch ( port )
+    {
+    case 0 ... 8 * PAGE_SIZE - 1: /* first 4KB page */
         break;
-    case 32768 ... 65535: /* second 4KB page */
-        port -= 32768;
-        gfn = iopm_gfn + 1;
+    case 8 * PAGE_SIZE ... 2 * 8 * PAGE_SIZE - 1: /* second 4KB page */
+        port -= 8 * PAGE_SIZE;
+        ++gfn;
         break;
     default:
         BUG();
         break;
     }

-    io_bitmap = hvm_map_guest_frame_ro(gfn, 0);
-    if (io_bitmap == NULL) {
-        gdprintk(XENLOG_ERR,
-                 "IOIO intercept: mapping of permission map failed\n");
-        return NESTEDHVM_VMEXIT_ERROR;
+    for ( io_bitmap = hvm_map_guest_frame_ro(gfn, 0); ; )
+    {
+        enabled = io_bitmap && test_bit(port, io_bitmap);
+        if ( !enabled || !--size )
+            break;
+        if ( unlikely(++port == 8 * PAGE_SIZE) )
+        {
+            hvm_unmap_guest_frame(io_bitmap, 0);
+            io_bitmap = hvm_map_guest_frame_ro(++gfn, 0);
+            port -= 8 * PAGE_SIZE;
+        }
     }
-
-    enabled = test_bit(port, io_bitmap);
     hvm_unmap_guest_frame(io_bitmap, 0);

-    if (!enabled)
+    if ( !enabled )
         return NESTEDHVM_VMEXIT_HOST;

     return NESTEDHVM_VMEXIT_INJECT;
@@ -966,8 +973,8 @@ nsvm_vmcb_guest_intercepts_exitcode(stru
     switch (exitcode) {
     case VMEXIT_MSR:
         ASSERT(regs != NULL);
-        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
-        ASSERT(nv->nv_vvmcx != NULL);
+        if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) )
+            break;
         ns_vmcb = nv->nv_vvmcx;
         vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm,
             regs->ecx, ns_vmcb->exitinfo1 != 0);
@@ -975,8 +982,8 @@ nsvm_vmcb_guest_intercepts_exitcode(stru
             return 0;
         break;
     case VMEXIT_IOIO:
-        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
-        ASSERT(nv->nv_vvmcx != NULL);
+        if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) )
+            break;
         ns_vmcb = nv->nv_vvmcx;
         vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa,
             ns_vmcb->exitinfo1);
52820863-VMX-don-t-crash-processing-d-debug-key.patch (new file, 105 lines)
@ -0,0 +1,105 @@
References: bnc#846849

# Commit 58929248461ecadce13e92eb5a5d9ef718a7c88e
# Date 2013-11-12 11:52:19 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
VMX: don't crash processing 'd' debug key

There's a window during scheduling where "current" and the active VMCS
may disagree: The former gets set much earlier than the latter. Since
both vmx_vmcs_enter() and vmx_vmcs_exit() immediately return when the
subject vCPU is "current", accessing VMCS fields would, depending on
whether there is any currently active VMCS, either read wrong data, or
cause a crash.

Going forward we might want to consider reducing the window during
which vmx_vmcs_enter() might fail (e.g. doing a plain __vmptrld() when
v->arch.hvm_vmx.vmcs != this_cpu(current_vmcs) but arch_vmx->active_cpu
== -1), but that would add complexities (acquiring and - more
importantly - properly dropping v->arch.hvm_vmx.vmcs_lock) that don't
look worthwhile adding right now.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -591,16 +591,16 @@ struct foreign_vmcs {
 };
 static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);

-void vmx_vmcs_enter(struct vcpu *v)
+bool_t vmx_vmcs_try_enter(struct vcpu *v)
 {
     struct foreign_vmcs *fv;

     /*
      * NB. We must *always* run an HVM VCPU on its own VMCS, except for
-     * vmx_vmcs_enter/exit critical regions.
+     * vmx_vmcs_enter/exit and scheduling tail critical regions.
      */
     if ( likely(v == current) )
-        return;
+        return v->arch.hvm_vmx.vmcs == this_cpu(current_vmcs);

     fv = &this_cpu(foreign_vmcs);

@@ -623,6 +623,15 @@ void vmx_vmcs_enter(struct vcpu *v)
     }

     fv->count++;
+
+    return 1;
+}
+
+void vmx_vmcs_enter(struct vcpu *v)
+{
+    bool_t okay = vmx_vmcs_try_enter(v);
+
+    ASSERT(okay);
 }

 void vmx_vmcs_exit(struct vcpu *v)
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -669,7 +669,27 @@ void vmx_get_segment_register(struct vcp
 {
     uint32_t attr = 0;

-    vmx_vmcs_enter(v);
+    /*
+     * We may get here in the context of dump_execstate(), which may have
+     * interrupted context switching between setting "current" and
+     * vmx_do_resume() reaching the end of vmx_load_vmcs(). That would make
+     * all the VMREADs below fail if we don't bail right away.
+     */
+    if ( unlikely(!vmx_vmcs_try_enter(v)) )
+    {
+        static bool_t warned;
+
+        if ( !warned )
+        {
+            warned = 1;
+            printk(XENLOG_WARNING "Segment register inaccessible for d%dv%d\n"
+                   "(If you see this outside of debugging activity,"
+                   " please report to xen-devel@lists.xenproject.org)\n",
+                   v->domain->domain_id, v->vcpu_id);
+        }
+        memset(reg, 0, sizeof(*reg));
+        return;
+    }

     switch ( seg )
     {
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -144,6 +144,7 @@ struct arch_vmx_struct {
 int vmx_create_vmcs(struct vcpu *v);
 void vmx_destroy_vmcs(struct vcpu *v);
 void vmx_vmcs_enter(struct vcpu *v);
+bool_t __must_check vmx_vmcs_try_enter(struct vcpu *v);
 void vmx_vmcs_exit(struct vcpu *v);

 #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
5282492f-x86-eliminate-has_arch_mmios.patch (new file, 84 lines)
@ -0,0 +1,84 @@
# Commit 79233938ab2a8f273fd5dcdbf8e8381b9eb3a461
# Date 2013-11-12 16:28:47 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: eliminate has_arch_mmios()

... as being generally insufficient: Either has_arch_pdevs() or
cache_flush_permitted() should be used (in particular, it is
insufficient to consider MMIO ranges alone - I/O port ranges have the
same requirements if available to a guest).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -40,6 +40,7 @@
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
+#include <asm/iocap.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -1792,7 +1793,7 @@ int hvm_set_cr0(unsigned long value)
         }
     }

-    if ( has_arch_mmios(v->domain) )
+    if ( cache_flush_permitted(v->domain) )
     {
         if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
         {
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -40,6 +40,7 @@
 #include <asm/debugreg.h>
 #include <asm/msr.h>
 #include <asm/i387.h>
+#include <asm/iocap.h>
 #include <asm/spinlock.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
@@ -1973,7 +1974,7 @@ static void wbinvd_ipi(void *info)

 static void svm_wbinvd_intercept(void)
 {
-    if ( has_arch_mmios(current->domain) )
+    if ( cache_flush_permitted(current->domain) )
         on_each_cpu(wbinvd_ipi, NULL, 1);
 }

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -28,6 +28,7 @@
 #include <xen/perfc.h>
 #include <asm/current.h>
 #include <asm/io.h>
+#include <asm/iocap.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -2173,10 +2174,7 @@ static void wbinvd_ipi(void *info)

 static void vmx_wbinvd_intercept(void)
 {
-    if ( !has_arch_mmios(current->domain) )
-        return;
-
-    if ( iommu_snoop )
+    if ( !cache_flush_permitted(current->domain) || iommu_snoop )
         return;

     if ( cpu_has_wbinvd_exiting )
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -316,7 +316,6 @@ struct arch_domain
 } __cacheline_aligned;

 #define has_arch_pdevs(d)    (!list_empty(&(d)->arch.pdev_list))
-#define has_arch_mmios(d)    (!rangeset_is_empty((d)->iomem_caps))

 #define gdt_ldt_pt_idx(v) \
     ((v)->vcpu_id >> (PAGETABLE_ORDER - GDT_LDT_VCPU_SHIFT))
@ -0,0 +1,63 @@
# Commit 1320b8100c2ed390fc640557a050f5c700d8338d
# Date 2013-11-15 17:38:10 +0100
# Author Nate Studer <nate.studer@dornerworks.com>
# Committer Jan Beulich <jbeulich@suse.com>
credit: Update other parameters when setting tslice_ms

Add a utility function to update the rest of the timeslice
accounting fields when updating the timeslice of the
credit scheduler, so that capped CPUs behave correctly.

Before this patch changing the timeslice to a value higher
than the default would result in a domain not utilizing
its full capacity and changing the timeslice to a value
lower than the default would result in a domain exceeding
its capacity.

Signed-off-by: Nate Studer <nate.studer@dornerworks.com>
Reviewed-by: Dario Faggioli <dario.faggioli@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>

--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -1073,6 +1073,17 @@ csched_dom_cntl(
     return 0;
 }

+static inline void
+__csched_set_tslice(struct csched_private *prv, unsigned timeslice)
+{
+    prv->tslice_ms = timeslice;
+    prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
+    if ( prv->tslice_ms < prv->ticks_per_tslice )
+        prv->ticks_per_tslice = 1;
+    prv->tick_period_us = prv->tslice_ms * 1000 / prv->ticks_per_tslice;
+    prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * prv->tslice_ms;
+}
+
 static int
 csched_sys_cntl(const struct scheduler *ops,
                         struct xen_sysctl_scheduler_op *sc)
@@ -1091,7 +1102,7 @@ csched_sys_cntl(const struct scheduler *
                 || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
              || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) )
                 goto out;
-            prv->tslice_ms = params->tslice_ms;
+            __csched_set_tslice(prv, params->tslice_ms);
             prv->ratelimit_us = params->ratelimit_us;
             /* FALLTHRU */
         case XEN_SYSCTL_SCHEDOP_getinfo:
@@ -1903,12 +1914,7 @@ csched_init(struct scheduler *ops)
         sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
     }

-    prv->tslice_ms = sched_credit_tslice_ms;
-    prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
-    if ( prv->tslice_ms < prv->ticks_per_tslice )
-        prv->ticks_per_tslice = 1;
-    prv->tick_period_us = prv->tslice_ms * 1000 / prv->ticks_per_tslice;
-    prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * prv->tslice_ms;
+    __csched_set_tslice(prv, sched_credit_tslice_ms);

    if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
    {
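
The refactoring pattern in isolation, with illustrative constants: when one tunable drives several derived fields, every write goes through a single setter so the fields can never go stale relative to each other.

#include <stdio.h>

#define TICKS_PER_TSLICE 3
#define CREDITS_PER_MSEC 10

struct sched_params {
    unsigned tslice_ms, ticks_per_tslice, tick_period_us, credits_per_tslice;
};

static void set_tslice(struct sched_params *p, unsigned timeslice)
{
    p->tslice_ms = timeslice;
    p->ticks_per_tslice = TICKS_PER_TSLICE;
    if (p->tslice_ms < p->ticks_per_tslice)
        p->ticks_per_tslice = 1;
    p->tick_period_us = p->tslice_ms * 1000 / p->ticks_per_tslice;
    p->credits_per_tslice = CREDITS_PER_MSEC * p->tslice_ms;
}

int main(void)
{
    struct sched_params p;

    set_tslice(&p, 30);   /* all four fields updated together */
    printf("tick period %uus, credits %u\n",
           p.tick_period_us, p.credits_per_tslice);
    return 0;
}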
@ -0,0 +1,21 @@
# Commit 6757efe1bf50ac7ff68fa4dd7d9333529f70ae9a
# Date 2013-11-15 17:43:28 +0100
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
fix leaking of v->cpu_affinity_saved on domain destruction

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -726,6 +726,7 @@ static void complete_domain_destroy(stru
         {
             free_cpumask_var(v->cpu_affinity);
             free_cpumask_var(v->cpu_affinity_tmp);
+            free_cpumask_var(v->cpu_affinity_saved);
             free_cpumask_var(v->vcpu_dirty_cpumask);
             free_vcpu_struct(v);
         }
5289d225-nested-VMX-don-t-ignore-mapping-errors.patch (new file, 115 lines)
@ -0,0 +1,115 @@
# Commit e02b14e531a95399fc9d8647ec3cc6f310a7d455
# Date 2013-11-18 09:39:01 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
nested VMX: don't ignore mapping errors

Rather than ignoring failures to map the virtual VMCS as well as MSR or
I/O port bitmaps, convert those into failures of the respective
instructions (avoiding to dereference NULL pointers). Ultimately such
failures should be handled transparently (by using transient mappings
when they actually need to be accessed, just like nested SVM does).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Eddie Dong <eddie.dong@intel.com>

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -746,7 +746,7 @@ static void __clear_current_vvmcs(struct
         __vmpclear(virt_to_maddr(nvcpu->nv_n2vmcx));
 }

-static void __map_msr_bitmap(struct vcpu *v)
+static bool_t __must_check _map_msr_bitmap(struct vcpu *v)
 {
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
     unsigned long gpa;
@@ -755,9 +755,11 @@ static void __map_msr_bitmap(struct vcpu
         hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
     gpa = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, MSR_BITMAP);
     nvmx->msrbitmap = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
+
+    return nvmx->msrbitmap != NULL;
 }

-static void __map_io_bitmap(struct vcpu *v, u64 vmcs_reg)
+static bool_t __must_check _map_io_bitmap(struct vcpu *v, u64 vmcs_reg)
 {
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
     unsigned long gpa;
@@ -768,12 +770,14 @@ static void __map_io_bitmap(struct vcpu
         hvm_unmap_guest_frame(nvmx->iobitmap[index], 1);
     gpa = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, vmcs_reg);
     nvmx->iobitmap[index] = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
+
+    return nvmx->iobitmap[index] != NULL;
 }

-static inline void map_io_bitmap_all(struct vcpu *v)
+static inline bool_t __must_check map_io_bitmap_all(struct vcpu *v)
 {
-   __map_io_bitmap (v, IO_BITMAP_A);
-   __map_io_bitmap (v, IO_BITMAP_B);
+    return _map_io_bitmap(v, IO_BITMAP_A) &&
+           _map_io_bitmap(v, IO_BITMAP_B);
 }

 static void nvmx_purge_vvmcs(struct vcpu *v)
@@ -1609,9 +1613,15 @@ int nvmx_handle_vmptrld(struct cpu_user_
     if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
     {
         nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
-        nvcpu->nv_vvmcxaddr = gpa;
-        map_io_bitmap_all (v);
-        __map_msr_bitmap(v);
+        if ( nvcpu->nv_vvmcx )
+            nvcpu->nv_vvmcxaddr = gpa;
+        if ( !nvcpu->nv_vvmcx ||
+             !map_io_bitmap_all(v) ||
+             !_map_msr_bitmap(v) )
+        {
+            vmreturn(regs, VMFAIL_VALID);
+            goto out;
+        }
     }

     if ( cpu_has_vmx_vmcs_shadowing )
@@ -1723,6 +1733,7 @@ int nvmx_handle_vmwrite(struct cpu_user_
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     unsigned long operand;
     u64 vmcs_encoding;
+    bool_t okay = 1;

     if ( decode_vmx_inst(regs, &decode, &operand, 0)
              != X86EMUL_OKAY )
@@ -1731,16 +1742,21 @@ int nvmx_handle_vmwrite(struct cpu_user_
     vmcs_encoding = reg_read(regs, decode.reg2);
     __set_vvmcs(nvcpu->nv_vvmcx, vmcs_encoding, operand);

-    if ( vmcs_encoding == IO_BITMAP_A || vmcs_encoding == IO_BITMAP_A_HIGH )
-        __map_io_bitmap (v, IO_BITMAP_A);
-    else if ( vmcs_encoding == IO_BITMAP_B ||
-              vmcs_encoding == IO_BITMAP_B_HIGH )
-        __map_io_bitmap (v, IO_BITMAP_B);
+    switch ( vmcs_encoding )
+    {
+    case IO_BITMAP_A: case IO_BITMAP_A_HIGH:
+        okay = _map_io_bitmap(v, IO_BITMAP_A);
+        break;
+    case IO_BITMAP_B: case IO_BITMAP_B_HIGH:
+        okay = _map_io_bitmap(v, IO_BITMAP_B);
+        break;
+    case MSR_BITMAP: case MSR_BITMAP_HIGH:
+        okay = _map_msr_bitmap(v);
+        break;
+    }

-    if ( vmcs_encoding == MSR_BITMAP || vmcs_encoding == MSR_BITMAP_HIGH )
-        __map_msr_bitmap(v);
+    vmreturn(regs, okay ? VMSUCCEED : VMFAIL_VALID);

-    vmreturn(regs, VMSUCCEED);
     return X86EMUL_OKAY;
 }
528a0e5b-TLB-flushing-in-dma_pte_clear_one.patch (new file, 32 lines)
@ -0,0 +1,32 @@
References: bnc#851386 CVE-2013-6375 XSA-78

# HG changeset patch
# User Jan Beulich <jbeulich@suse.com>
# Date 1384779355 -3600
# Node ID 81fec8e36840041ca5779a4c4f2eed98180eda2e
# Parent de9b11c80e2d3bd795d6329e0979c4734c3b4f96
VT-d: fix TLB flushing in dma_pte_clear_one()

The third parameter of __intel_iommu_iotlb_flush() is to indicate
whether the to be flushed entry was a present one. A few lines before,
we bailed if !dma_pte_present(*pte), so there's no need to check the
flag here again - we can simply always pass TRUE here.

This is CVE-2013-6375 / XSA-78.

Suggested-by: Cheng Yueqiang <yqcheng.2008@phdis.smu.edu.sg>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -646,7 +646,7 @@ static void dma_pte_clear_one(struct dom
     iommu_flush_cache_entry(pte, sizeof(struct dma_pte));

     if ( !this_cpu(iommu_dont_flush_iotlb) )
-        __intel_iommu_iotlb_flush(domain, addr >> PAGE_SHIFT_4K , 0, 1);
+        __intel_iommu_iotlb_flush(domain, addr >> PAGE_SHIFT_4K, 1, 1);

     unmap_vtd_domain_page(page);
528a0eb0-x86-consider-modules-when-cutting-off-memory.patch (new file, 40 lines)
@ -0,0 +1,40 @@
References: bnc#848014

# Commit a5db2c7aab7a638d84f22ac8fe5089d81175438b
# Date 2013-11-18 13:57:20 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: consider modules when cutting off memory

The code in question runs after module ranges have already been removed
from the E820 table, so when determining the new maximum page/PDX we
need to take them into account explicitly.

Furthermore we need to round up the ending addresses here, in order to
fully cover eventual partial trailing pages.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1012,9 +1012,17 @@ void __init __start_xen(unsigned long mb
                 ASSERT(j);
             }
             map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
-            if ( (map_e >> PAGE_SHIFT) < max_page )
+            for ( j = 0; j < mbi->mods_count; ++j )
             {
-                max_page = map_e >> PAGE_SHIFT;
+                uint64_t end = pfn_to_paddr(mod[j].mod_start) +
+                               mod[j].mod_end;
+
+                if ( map_e < end )
+                    map_e = end;
+            }
+            if ( PFN_UP(map_e) < max_page )
+            {
+                max_page = PFN_UP(map_e);
                 max_pdx = pfn_to_pdx(max_page - 1) + 1;
             }
             printk(XENLOG_WARNING "Ignoring inaccessible memory range"
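
A small standalone sketch of the arithmetic (invented example values,
not the real setup.c context) shows why both changes matter: a module
may extend past the last usable E820 range, and a module ending
mid-page must still keep its final, partial page below the cut-off.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)  /* round-up pfn */

int main(void)
{
    uint64_t map_e = 0x7f000000;          /* end of last usable E820 range */
    uint64_t mod_end[] = { 0x7f200800 };  /* module ends 0x800 into a page */
    unsigned long max_page = 0x80000;     /* current limit: 2 GiB worth */

    for (unsigned i = 0; i < sizeof(mod_end) / sizeof(mod_end[0]); i++)
        if (map_e < mod_end[i])
            map_e = mod_end[i];           /* modules were cut from the E820 */

    if (PFN_UP(map_e) < max_page)
        max_page = PFN_UP(map_e);         /* round up: keep the partial page */

    printf("new max_page = %#lx (covers up to %#llx)\n",
           max_page, (unsigned long long)max_page << PAGE_SHIFT);
    return 0;
}

Truncating with map_e >> PAGE_SHIFT instead of PFN_UP(map_e) would drop
the module's last 0x800 bytes out of the accessible range.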
528f606c-x86-hvm-reset-TSC-to-0-after-domain-resume-from-S3.patch (new file, 30 lines)
@@ -0,0 +1,30 @@
# Commit e95dc6ba69daef6468b3ae5912710727244d6e2f
# Date 2013-11-22 14:47:24 +0100
# Author Tomasz Wroblewski <tomasz.wroblewski@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvm: reset TSC to 0 after domain resume from S3

Host S3 implicitly resets the host TSC to 0, but the TSC offset for HVM
domains is not recalculated when they resume, causing it to go
negative. In a Linux guest using the tsc clocksource, this results in a
hang after the wrap back to positive values, since the tsc clocksource
implementation expects the counter to have been reset.

Signed-off-by: Tomasz Wroblewski <tomasz.wroblewski@citrix.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3607,7 +3607,13 @@ static void hvm_s3_suspend(struct domain
 static void hvm_s3_resume(struct domain *d)
 {
     if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
+    {
+        struct vcpu *v;
+
+        for_each_vcpu( d, v )
+            hvm_set_guest_tsc(v, 0);
         domain_unpause(d);
+    }
 }
 
 static int hvmop_set_isa_irq_level(
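
The hang mechanism is plain offset arithmetic: the guest-visible TSC is
host_tsc + tsc_offset. A toy C sketch (invented numbers, not hypervisor
code) shows the value going negative with a stale offset and restarting
from 0 once the offset is recomputed, which is the effect of the
hvm_set_guest_tsc(v, 0) loop above.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t host_tsc  = 50000000000ULL;   /* host counter before suspend */
    int64_t  tsc_offset = -40000000000LL;  /* guest started when host = 40e9 */

    printf("guest TSC before S3:          %lld\n",
           (long long)(host_tsc + tsc_offset));   /* 10e9: sane */

    host_tsc = 0;                          /* S3 resets the host TSC */
    printf("guest TSC after S3 (stale):   %lld\n",
           (long long)(host_tsc + tsc_offset));   /* -40e9: "negative" TSC */

    tsc_offset = 0 - host_tsc;             /* recompute so guest reads 0 */
    printf("guest TSC after the fix:      %lld\n",
           (long long)(host_tsc + tsc_offset));   /* restarts from 0 */
    return 0;
}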
528f609c-x86-crash-disable-the-watchdog-NMIs-on-the-crashing-cpu.patch (new file, 60 lines)
@@ -0,0 +1,60 @@
# Commit 2a16fcd5ba0244fef764886211452acc69c0ed00
# Date 2013-11-22 14:48:12 +0100
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/crash: disable the watchdog NMIs on the crashing cpu

nmi_shootdown_cpus() is called during a crash to park all the other
CPUs. This changes the NMI trap handlers, which means there's no point
in having the watchdog still running.

This also disables the watchdog before executing any crash kexec image,
and prevents the image from receiving unexpected NMIs.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>

PVOps Linux as a kexec image shoots itself in the foot otherwise.

On a Core2 system, Linux declares a firmware bug and tries to invert
some bits in the performance counter register. It ends up setting the
number of retired instructions required to generate another NMI to
fewer instructions than the NMI interrupt path itself needs, and ceases
to make any useful progress.

The call to disable_lapic_nmi_watchdog() must be this late in the kexec
path to be sure that this cpu is the one which will execute the kexec
image. Otherwise there are race conditions where the NMIs might be
disabled on the wrong cpu, resulting in the kexec image still receiving
NMIs.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -117,6 +117,7 @@ static void nmi_shootdown_cpus(void)
     unsigned long msecs;
     int i, cpu = smp_processor_id();
 
+    disable_lapic_nmi_watchdog();
     local_irq_disable();
 
     crashing_cpu = cpu;
--- a/xen/arch/x86/nmi.c
+++ b/xen/arch/x86/nmi.c
@@ -165,7 +165,7 @@ static void nmi_timer_fn(void *unused)
     set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000));
 }
 
-static void disable_lapic_nmi_watchdog(void)
+void disable_lapic_nmi_watchdog(void)
 {
     if (nmi_active <= 0)
         return;
--- a/xen/include/asm-x86/apic.h
+++ b/xen/include/asm-x86/apic.h
@@ -200,6 +200,7 @@ extern void smp_local_timer_interrupt (s
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
 extern void setup_apic_nmi_watchdog (void);
+extern void disable_lapic_nmi_watchdog(void);
 extern int reserve_lapic_nmi(void);
 extern void release_lapic_nmi(void);
 extern void self_nmi(void);
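
The ordering argument can be summarised in a stub sketch (invented stub
bodies mirroring the names above, not the real crash path): the
watchdog must be switched off by the crashing cpu itself, before the
NMI traps are repointed and before control passes to the kexec image.

#include <stdio.h>

static void disable_lapic_nmi_watchdog(void) { puts("watchdog NMIs off (this cpu)"); }
static void local_irq_disable(void)          { puts("irqs off"); }
static void park_other_cpus_via_nmi(void)    { puts("NMI shootdown of other cpus"); }
static void exec_kexec_image(void)           { puts("jump to crash kernel"); }

static void nmi_shootdown_cpus(void)
{
    /* Must run on the crashing cpu: disabling the watchdog anywhere
     * else races, leaving this cpu's watchdog firing into the image. */
    disable_lapic_nmi_watchdog();
    local_irq_disable();
    park_other_cpus_via_nmi();
}

int main(void)
{
    nmi_shootdown_cpus();
    exec_kexec_image();     /* image now sees no stray perf-counter NMIs */
    return 0;
}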
52932418-x86-xsave-fix-nonlazy-state-handling.patch (new file, 89 lines)
@@ -0,0 +1,89 @@
# Commit 7d8b5dd98463524686bdee8b973b53c00c232122
# Date 2013-11-25 11:19:04 +0100
# Author Liu Jinsong <jinsong.liu@intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/xsave: fix nonlazy state handling

Nonlazy xstates should be xsaved each time vcpu_save_fpu() is invoked.
Operations on nonlazy xstates do not trigger a #NM exception, so the
state must be restored whenever the vcpu is scheduled in and saved
whenever it is scheduled out.

Currently this bug affects the AMD LWP feature, and later the Intel MPX
feature. With the fix both LWP and MPX work fine.

Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>

Furthermore, during restore we also need to set nonlazy_xstate_used
according to the incoming accumulated XCR0.

Also adjust the changes to i387.c such that there won't be a pointless
clts()/stts() pair.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1146,6 +1146,8 @@ long arch_do_domctl(
         {
             v->arch.xcr0 = _xcr0;
             v->arch.xcr0_accum = _xcr0_accum;
+            if ( _xcr0_accum & XSTATE_NONLAZY )
+                v->arch.nonlazy_xstate_used = 1;
             memcpy(v->arch.xsave_area, _xsave_area,
                    evc->size - 2 * sizeof(uint64_t));
         }
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1073,6 +1073,8 @@ static int hvm_load_cpu_xsave_states(str
 
     v->arch.xcr0 = ctxt->xcr0;
     v->arch.xcr0_accum = ctxt->xcr0_accum;
+    if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
+        v->arch.nonlazy_xstate_used = 1;
     memcpy(v->arch.xsave_area, &ctxt->save_area,
            desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area));
 
--- a/xen/arch/x86/i387.c
+++ b/xen/arch/x86/i387.c
@@ -120,11 +120,22 @@ static inline void fpu_frstor(struct vcp
 /*******************************/
 /*      FPU Save Functions     */
 /*******************************/
+
+static inline uint64_t vcpu_xsave_mask(const struct vcpu *v)
+{
+    if ( v->fpu_dirtied )
+        return v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY;
+
+    return v->arch.nonlazy_xstate_used ? XSTATE_NONLAZY : 0;
+}
+
 /* Save x87 extended state */
 static inline void fpu_xsave(struct vcpu *v)
 {
     bool_t ok;
+    uint64_t mask = vcpu_xsave_mask(v);
 
+    ASSERT(mask);
     ASSERT(v->arch.xsave_area);
     /*
      * XCR0 normally represents what guest OS set. In case of Xen itself,
@@ -132,7 +143,7 @@ static inline void fpu_xsave(struct vcpu
      */
     ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE);
     ASSERT(ok);
-    xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY);
+    xsave(v, mask);
     ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE);
     ASSERT(ok);
 }
@@ -263,7 +274,7 @@ void vcpu_restore_fpu_lazy(struct vcpu *
  */
 void vcpu_save_fpu(struct vcpu *v)
 {
-    if ( !v->fpu_dirtied )
+    if ( !v->fpu_dirtied && !v->arch.nonlazy_xstate_used )
        return;
 
     ASSERT(!is_idle_vcpu(v));
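
The new vcpu_xsave_mask() boils down to a four-entry decision table:
lazy state needs saving only when the FPU was dirtied (a #NM fired),
while nonlazy state (LWP, later MPX) must be saved on every deschedule.
The standalone sketch below (illustrative XSTATE_* bit values, not the
architectural ones) prints all four cases.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define XSTATE_LAZY    0x0ffULL   /* illustrative bit groups only */
#define XSTATE_NONLAZY 0xf00ULL
#define XSTATE_ALL     (XSTATE_LAZY | XSTATE_NONLAZY)

static uint64_t vcpu_xsave_mask(bool fpu_dirtied, bool nonlazy_used)
{
    if ( fpu_dirtied )
        return nonlazy_used ? XSTATE_ALL : XSTATE_LAZY;

    return nonlazy_used ? XSTATE_NONLAZY : 0;  /* 0: skip the save entirely */
}

int main(void)
{
    for (int d = 0; d < 2; d++)
        for (int n = 0; n < 2; n++)
            printf("dirtied=%d nonlazy=%d -> mask %#llx\n", d, n,
                   (unsigned long long)vcpu_xsave_mask(d, n));
    return 0;
}

The mask-of-zero row is exactly the case the adjusted vcpu_save_fpu()
guard skips, avoiding the pointless clts()/stts() pair the message
mentions.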
(deleted file, 33 lines)
@@ -1,33 +0,0 @@
References: bnc#842515 CVE-2013-4375 XSA-71

xen_disk: mark ioreq as mapped before unmapping in error case

Commit c6961b7d ("xen_disk: use bdrv_aio_flush instead of bdrv_flush")
modified the semantics of ioreq_{un,}map so that they are idempotent if
called when they're not needed (i.e., twice in a row). However, it
neglected to handle the case where batch mapping is not being used (the
default) and one of the grants fails to map. In this case, ioreq_unmap
will be called to unwind and unmap any mappings already performed, but
ioreq_unmap simply returns due to the aforementioned change (the ioreq
has not already been marked as mapped).

The frontend user can therefore force xen_disk to leak grant mappings,
a per-backend-domain limited resource.

Fix by marking the ioreq as mapped before calling ioreq_unmap in this
situation.

This is XSA-71 / CVE-2013-4375

Signed-off-by: Matthew Daley <mattjd@gmail.com>

--- a/tools/qemu-xen-dir-remote/hw/xen_disk.c
+++ b/tools/qemu-xen-dir-remote/hw/xen_disk.c
@@ -406,6 +406,7 @@ static int ioreq_map(struct ioreq *ioreq
             xen_be_printf(&ioreq->blkdev->xendev, 0,
                           "can't map grant ref %d (%s, %d maps)\n",
                           refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+            ioreq->mapped = 1;
             ioreq_unmap(ioreq);
             return -1;
         }
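
The leak is easiest to see in a toy model (invented minimal struct and
counters, not the real xen_disk.c): without setting mapped before the
unwind call, the idempotence guard in ioreq_unmap() returns immediately
and the already-mapped grants are never released.

#include <stdio.h>

struct ioreq { int mapped; int nr_mapped; };

static void ioreq_unmap(struct ioreq *r)
{
    if (!r->mapped)             /* idempotence guard added by c6961b7d */
        return;
    printf("unmapping %d grants\n", r->nr_mapped);
    r->nr_mapped = 0;
    r->mapped = 0;
}

static int ioreq_map(struct ioreq *r, int nr, int fail_at)
{
    for (int i = 0; i < nr; i++) {
        if (i == fail_at) {     /* grant i failed to map */
            r->mapped = 1;      /* the fix: mark before unwinding */
            ioreq_unmap(r);     /* without the fix: silent leak */
            return -1;
        }
        r->nr_mapped++;
    }
    r->mapped = 1;
    return 0;
}

int main(void)
{
    struct ioreq r = { 0, 0 };
    ioreq_map(&r, 4, 2);                       /* fail on the third grant */
    printf("grants left mapped: %d\n", r.nr_mapped);  /* 0 with the fix */
    return 0;
}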
CVE-2013-4553-xsa74.patch (new file, 43 lines)
@@ -0,0 +1,43 @@
References: bnc#849667 CVE-2013-4553 XSA-74

x86: restrict XEN_DOMCTL_getmemlist

Coverity ID 1055652

(See the code comment.)

This is CVE-2013-4553 / XSA-74.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>

--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -329,6 +329,26 @@ long arch_do_domctl(
             break;
         }
 
+        /*
+         * XSA-74: This sub-hypercall is broken in several ways:
+         * - lock order inversion (p2m locks inside page_alloc_lock)
+         * - no preemption on huge max_pfns input
+         * - not (re-)checking d->is_dying with page_alloc_lock held
+         * - not honoring start_pfn input (which libxc also doesn't set)
+         * Additionally it is rather useless, as the result is stale by the
+         * time the caller gets to look at it.
+         * As it only has a single, non-production consumer (xen-mceinj),
+         * rather than trying to fix it we restrict it for the time being.
+         */
+        if ( /* No nested locks inside copy_to_guest_offset(). */
+             paging_mode_external(current->domain) ||
+             /* Arbitrary limit capping processing time. */
+             max_pfns > GB(4) / PAGE_SIZE )
+        {
+            ret = -EOPNOTSUPP;
+            break;
+        }
+
         spin_lock(&d->page_alloc_lock);
 
         ret = i = 0;
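
The two gates amount to a simple predicate; a plain C sketch (with a
hypothetical helper name, getmemlist_allowed, and invented caller
values) shows the cap working out to one million pfns (4 GiB worth of
pages), which bounds the time the non-preemptible loop can hold
page_alloc_lock.

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

#define PAGE_SIZE 4096ULL
#define GB(n)     ((uint64_t)(n) << 30)

static int getmemlist_allowed(int paging_external, uint64_t max_pfns)
{
    if (paging_external)                 /* no nested locks inside copy-out */
        return -EOPNOTSUPP;
    if (max_pfns > GB(4) / PAGE_SIZE)    /* cap: 2^20 pfns bounds hold time */
        return -EOPNOTSUPP;
    return 0;
}

int main(void)
{
    printf("PV caller, 1000 pfns: %d\n", getmemlist_allowed(0, 1000));
    printf("PV caller, 2M pfns:   %d\n", getmemlist_allowed(0, 2u << 20));
    printf("HVM caller, 10 pfns:  %d\n", getmemlist_allowed(1, 10));
    return 0;
}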
Some files were not shown because too many files have changed in this diff.