# Commit 101ce53266866144e724ed593173bc4098b300b9
# Date 2015-10-29 13:36:25 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/PoD: Eager sweep for zeroed pages
|
Based on the contents of a guest's physical address space,
p2m_pod_emergency_sweep() could degrade into a linear memcmp() from 0 to
max_gfn, which runs non-preemptibly.
|
As p2m_pod_emergency_sweep() runs behind the scenes in a number of contexts,
making it preemptible is not feasible.
|
Instead, a different approach is taken. Recently-populated pages are eagerly
checked for reclamation, which amortises the p2m_pod_emergency_sweep()
operation across each p2m_pod_demand_populate() operation.
|
Note that in the case that a 2M superpage can't be reclaimed as a superpage,
it is shattered if 4K pages of zeros can be reclaimed. This is unfortunate
but matches the previous behaviour, and is required to avoid regressions
(domain crash from PoD exhaustion) with VMs configured close to the limit.
|
This is CVE-2015-7970 / XSA-150.
|
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

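For illustration only (not part of the patch): a minimal, self-contained C
sketch of the "most recently populated" ring-buffer idea described above. The
names and types here (struct mrp_sim, mrp_record(), mrp_reclaim_one(),
try_reclaim) are hypothetical simplifications of the real p2m state; the
actual implementation is the pod_eager_record()/pod_eager_reclaim() code in
the diff below.

    #include <stdbool.h>

    #define MRP_ENTRIES   32
    #define GFN_INVALID   (~0UL)
    /* Tag in the top bit of a recorded GFN: "this was a 2M populate". */
    #define MRP_SUPERPAGE (GFN_INVALID & ~(GFN_INVALID >> 1))

    struct mrp_sim {
        unsigned long list[MRP_ENTRIES];
        unsigned int idx;
    };

    /* Mirror of the p2m_initialise() change: start with every slot empty. */
    static void mrp_init(struct mrp_sim *mrp)
    {
        unsigned int i;

        for ( i = 0; i < MRP_ENTRIES; ++i )
            mrp->list[i] = GFN_INVALID;
        mrp->idx = 0;
    }

    /* Remember a freshly populated GFN, overwriting the oldest slot. */
    static void mrp_record(struct mrp_sim *mrp, unsigned long gfn, bool superpage)
    {
        mrp->list[mrp->idx++] = gfn | (superpage ? MRP_SUPERPAGE : 0);
        mrp->idx %= MRP_ENTRIES;
    }

    /* Check one recently populated GFN per call, amortising the sweep.
     * try_reclaim() stands in for the p2m_pod_zero_check*() calls. */
    static void mrp_reclaim_one(struct mrp_sim *mrp,
                                void (*try_reclaim)(unsigned long gfn, bool super))
    {
        unsigned long gfn = mrp->list[mrp->idx];

        if ( gfn != GFN_INVALID )
        {
            try_reclaim(gfn & ~MRP_SUPERPAGE, gfn & MRP_SUPERPAGE);
            mrp->list[mrp->idx] = GFN_INVALID; /* each entry is checked once */
        }
    }

In the patch itself, p2m_pod_demand_populate() runs the reclaim step on entry
and records the newly populated GFN once the populate has succeeded.
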
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -901,28 +901,6 @@ p2m_pod_zero_check(struct p2m_domain *p2
 }
 
 #define POD_SWEEP_LIMIT 1024
-
-/* When populating a new superpage, look at recently populated superpages
- * hoping that they've been zeroed. This will snap up zeroed pages as soon as
- * the guest OS is done with them. */
-static void
-p2m_pod_check_last_super(struct p2m_domain *p2m, unsigned long gfn_aligned)
-{
-    unsigned long check_gfn;
-
-    ASSERT(p2m->pod.last_populated_index < POD_HISTORY_MAX);
-
-    check_gfn = p2m->pod.last_populated[p2m->pod.last_populated_index];
-
-    p2m->pod.last_populated[p2m->pod.last_populated_index] = gfn_aligned;
-
-    p2m->pod.last_populated_index =
-        ( p2m->pod.last_populated_index + 1 ) % POD_HISTORY_MAX;
-
-    p2m_pod_zero_check_superpage(p2m, check_gfn);
-}
-
-
 #define POD_SWEEP_STRIDE 16
 static void
 p2m_pod_emergency_sweep(struct p2m_domain *p2m)
@@ -963,7 +941,7 @@ p2m_pod_emergency_sweep(struct p2m_domai
          * NB that this is a zero-sum game; we're increasing our cache size
          * by re-increasing our 'debt'. Since we hold the pod lock,
          * (entry_count - count) must remain the same. */
-        if ( p2m->pod.count > 0 && i < limit )
+        if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
             break;
     }
 
@@ -975,6 +953,58 @@ p2m_pod_emergency_sweep(struct p2m_domai
 
 }
 
+static void pod_eager_reclaim(struct p2m_domain *p2m)
+{
+    struct pod_mrp_list *mrp = &p2m->pod.mrp;
+    unsigned int i = 0;
+
+    /*
+     * Always check one page for reclamation.
+     *
+     * If the PoD pool is empty, keep checking until some space is found, or
+     * all entries have been exhausted.
+     */
+    do
+    {
+        unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
+        unsigned long gfn = mrp->list[idx];
+
+        if ( gfn != INVALID_GFN )
+        {
+            if ( gfn & POD_LAST_SUPERPAGE )
+            {
+                gfn &= ~POD_LAST_SUPERPAGE;
+
+                if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
+                {
+                    unsigned int x;
+
+                    for ( x = 0; x < SUPERPAGE_PAGES; ++x, ++gfn )
+                        p2m_pod_zero_check(p2m, &gfn, 1);
+                }
+            }
+            else
+                p2m_pod_zero_check(p2m, &gfn, 1);
+
+            mrp->list[idx] = INVALID_GFN;
+        }
+
+    } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
+}
+
+static void pod_eager_record(struct p2m_domain *p2m,
+                             unsigned long gfn, unsigned int order)
+{
+    struct pod_mrp_list *mrp = &p2m->pod.mrp;
+
+    ASSERT(mrp->list[mrp->idx] == INVALID_GFN);
+    ASSERT(gfn != INVALID_GFN);
+
+    mrp->list[mrp->idx++] =
+        gfn | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
+    mrp->idx %= ARRAY_SIZE(mrp->list);
+}
+
 int
 p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
                         unsigned int order,
@@ -1015,6 +1045,8 @@ p2m_pod_demand_populate(struct p2m_domai
         return 0;
     }
 
+    pod_eager_reclaim(p2m);
+
     /* Only sweep if we're actually out of memory. Doing anything else
      * causes unnecessary time and fragmentation of superpages in the p2m. */
     if ( p2m->pod.count == 0 )
@@ -1051,6 +1083,8 @@ p2m_pod_demand_populate(struct p2m_domai
     p2m->pod.entry_count -= (1 << order);
     BUG_ON(p2m->pod.entry_count < 0);
 
+    pod_eager_record(p2m, gfn_aligned, order);
+
     if ( tb_init_done )
     {
         struct {
@@ -1066,12 +1100,6 @@ p2m_pod_demand_populate(struct p2m_domai
         __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
     }
 
-    /* Check the last guest demand-populate */
-    if ( p2m->pod.entry_count > p2m->pod.count
-         && (order == PAGE_ORDER_2M)
-         && (q & P2M_ALLOC) )
-        p2m_pod_check_last_super(p2m, gfn_aligned);
-
     pod_unlock(p2m);
     return 0;
 out_of_memory:
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -60,6 +60,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
 /* Init the datastructures for later use by the p2m code */
 static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
 {
+    unsigned int i;
     int ret = 0;
 
     mm_rwlock_init(&p2m->lock);
@@ -75,6 +76,9 @@ static int p2m_initialise(struct domain
 
     p2m->np2m_base = P2M_BASE_EADDR;
 
+    for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
+        p2m->pod.mrp.list[i] = INVALID_GFN;
+
     if ( hap_enabled(d) && cpu_has_vmx )
         ret = ept_p2m_init(p2m);
     else
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -292,10 +292,20 @@ struct p2m_domain {
                          entry_count;  /* # of pages in p2m marked pod      */
         unsigned long    reclaim_single; /* Last gpfn of a scan */
         unsigned long    max_guest;    /* gpfn of max guest demand-populate */
-#define POD_HISTORY_MAX 128
-        /* gpfn of last guest superpage demand-populated */
-        unsigned long     last_populated[POD_HISTORY_MAX];
-        unsigned int      last_populated_index;
+
+        /*
+         * Tracking of the most recently populated PoD pages, for eager
+         * reclamation.
+         */
+        struct pod_mrp_list {
+#define NR_POD_MRP_ENTRIES 32
+
+/* Encode ORDER_2M superpage in top bit of GFN */
+#define POD_LAST_SUPERPAGE (INVALID_GFN & ~(INVALID_GFN >> 1))
+
+            unsigned long list[NR_POD_MRP_ENTRIES];
+            unsigned int idx;
+        } mrp;
         mm_lock_t        lock;         /* Locking of private pod structs,   *
                                         * not relying on the p2m lock.      */
     } pod;
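
A note on the POD_LAST_SUPERPAGE encoding introduced above: assuming
INVALID_GFN expands to ~0UL in this tree, (INVALID_GFN & ~(INVALID_GFN >> 1))
reduces to just the most significant bit of an unsigned long, so the tag can
be OR'd into any real GFN without ever producing the INVALID_GFN "empty slot"
sentinel. A hypothetical stand-alone check of that reasoning (not Xen code):

    #include <assert.h>

    #define INVALID_GFN        (~0UL)
    #define POD_LAST_SUPERPAGE (INVALID_GFN & ~(INVALID_GFN >> 1))

    int main(void)
    {
        /* ~0UL >> 1 clears only the top bit; masking with its inverse
         * therefore leaves exactly that top bit set. */
        assert(POD_LAST_SUPERPAGE == 1UL << (sizeof(unsigned long) * 8 - 1));

        /* A tagged GFN still differs from the empty-slot sentinel, and the
         * original GFN is recovered by clearing the tag bit. */
        unsigned long gfn = 0x12345UL | POD_LAST_SUPERPAGE;
        assert(gfn != INVALID_GFN);
        assert((gfn & ~POD_LAST_SUPERPAGE) == 0x12345UL);
        return 0;
    }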