xen/18412-x86-page-type-preemptible.patch

1422 lines
45 KiB
Diff

# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1220262725 -3600
# Node ID 86b956d8cf046d071c828ca9e461311f68fc0c6e
# Parent 7cb51e8484f67e32c1cc169948d63cd5579fd5bf
x86: make {get,put}_page_type() preemptible
This is only a first step - more call sites need to be hooked up.
Most of this is really Keir's work, I just took what he handed me and
fixed a few remaining issues.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Index: xen-3.3.1-testing/xen/arch/x86/domain.c
===================================================================
--- xen-3.3.1-testing.orig/xen/arch/x86/domain.c
+++ xen-3.3.1-testing/xen/arch/x86/domain.c
@@ -1646,23 +1646,26 @@ static int relinquish_memory(
/*
* Forcibly invalidate top-most, still valid page tables at this point
- * to break circular 'linear page table' references. This is okay
- * because MMU structures are not shared across domains and this domain
- * is now dead. Thus top-most valid tables are not in use so a non-zero
- * count means circular reference.
+ * to break circular 'linear page table' references as well as clean up
+ * partially validated pages. This is okay because MMU structures are
+ * not shared across domains and this domain is now dead. Thus top-most
+ * valid tables are not in use so a non-zero count means a circular
+ * reference or a partially validated page.
*/
y = page->u.inuse.type_info;
for ( ; ; )
{
x = y;
- if ( likely((x & (PGT_type_mask|PGT_validated)) !=
- (type|PGT_validated)) )
+ if ( likely((x & PGT_type_mask) != type) ||
+ likely(!(x & (PGT_validated|PGT_partial))) )
break;
- y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
+ y = cmpxchg(&page->u.inuse.type_info, x,
+ x & ~(PGT_validated|PGT_partial));
if ( likely(y == x) )
{
- free_page_type(page, type);
+ if ( free_page_type(page, x, 0) != 0 )
+ BUG();
break;
}
}
Index: xen-3.3.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-3.3.1-testing.orig/xen/arch/x86/mm.c
+++ xen-3.3.1-testing/xen/arch/x86/mm.c
@@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag
goto fail;
unmap_domain_page(descs);
- return 1;
+ return 0;
fail:
unmap_domain_page(descs);
- return 0;
+ return -EINVAL;
}
@@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned
static int get_page_and_type_from_pagenr(unsigned long page_nr,
unsigned long type,
- struct domain *d)
+ struct domain *d,
+ int preemptible)
{
struct page_info *page = mfn_to_page(page_nr);
+ int rc;
if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
- return 0;
+ return -EINVAL;
- if ( unlikely(!get_page_type(page, type)) )
- {
+ rc = (preemptible ?
+ get_page_type_preemptible(page, type) :
+ (get_page_type(page, type) ? 0 : -EINVAL));
+
+ if ( rc )
put_page(page);
- return 0;
- }
- return 1;
+ return rc;
}
/*
@@ -754,22 +757,23 @@ get_page_from_l2e(
if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
{
MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
- return 0;
+ return -EINVAL;
}
- rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l2_linear_pagetable(l2e, pfn, d);
+ rc = get_page_and_type_from_pagenr(
+ l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+ if ( unlikely(rc) && rc != -EAGAIN && /* typed ref failed: try linear p.t. */
+ get_l2_linear_pagetable(l2e, pfn, d) ) /* non-zero means success */
+ rc = 0; /* linear mapping accepted, so report success, not -EINVAL */
return rc;
}
-#if CONFIG_PAGING_LEVELS >= 3
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
{
int rc;
@@ -779,22 +783,23 @@ get_page_from_l3e(
if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
{
MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
- return 0;
+ return -EINVAL;
}
- rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l3_linear_pagetable(l3e, pfn, d);
+ rc = get_page_and_type_from_pagenr(
+ l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+ if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR && /* hard failure only */
+ get_l3_linear_pagetable(l3e, pfn, d) ) /* non-zero means success */
+ rc = 0; /* linear mapping accepted, so report success, not -EINVAL */
return rc;
}
-#endif /* 3 level */
#if CONFIG_PAGING_LEVELS >= 4
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
{
int rc;
@@ -804,12 +809,14 @@ get_page_from_l4e(
if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
{
MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
- return 0;
+ return -EINVAL;
}
- rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l4_linear_pagetable(l4e, pfn, d);
+ rc = get_page_and_type_from_pagenr(
+ l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+ if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR && /* hard failure only */
+ get_l4_linear_pagetable(l4e, pfn, d) ) /* non-zero means success */
+ rc = 0; /* linear mapping accepted, so report success, not -EINVAL */
return rc;
}
@@ -946,29 +953,35 @@ void put_page_from_l1e(l1_pgentry_t l1e,
* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
* Note also that this automatically deals correctly with linear p.t.'s.
*/
-static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
{
if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
(l2e_get_pfn(l2e) != pfn) )
+ {
put_page_and_type(l2e_get_page(l2e));
+ return 0;
+ }
+ return 1;
}
-#if CONFIG_PAGING_LEVELS >= 3
-static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ int preemptible)
{
if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
(l3e_get_pfn(l3e) != pfn) )
- put_page_and_type(l3e_get_page(l3e));
+ return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+ return 1;
}
-#endif
#if CONFIG_PAGING_LEVELS >= 4
-static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ int preemptible)
{
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
- put_page_and_type(l4e_get_page(l4e));
+ return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+ return 1;
}
#endif
@@ -977,7 +990,7 @@ static int alloc_l1_table(struct page_in
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l1_pgentry_t *pl1e;
- int i;
+ unsigned int i;
pl1e = map_domain_page(pfn);
@@ -991,7 +1004,7 @@ static int alloc_l1_table(struct page_in
}
unmap_domain_page(pl1e);
- return 1;
+ return 0;
fail:
MEM_LOG("Failure in alloc_l1_table: entry %d", i);
@@ -1000,7 +1013,7 @@ static int alloc_l1_table(struct page_in
put_page_from_l1e(pl1e[i], d);
unmap_domain_page(pl1e);
- return 0;
+ return -EINVAL;
}
static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
@@ -1128,47 +1141,53 @@ static void pae_flush_pgd(
# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
#endif
-static int alloc_l2_table(struct page_info *page, unsigned long type)
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l2_pgentry_t *pl2e;
- int i;
+ unsigned int i;
+ int rc = 0;
pl2e = map_domain_page(pfn);
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
{
- if ( !is_guest_l2_slot(d, type, i) )
+ if ( preemptible && i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ rc = -EAGAIN;
+ break;
+ }
+
+ if ( !is_guest_l2_slot(d, type, i) ||
+ (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
continue;
- if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
- goto fail;
-
+ if ( rc < 0 )
+ {
+ MEM_LOG("Failure in alloc_l2_table: entry %d", i);
+ while ( i-- > 0 )
+ if ( is_guest_l2_slot(d, type, i) )
+ put_page_from_l2e(pl2e[i], pfn);
+ break;
+ }
+
adjust_guest_l2e(pl2e[i], d);
}
unmap_domain_page(pl2e);
- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l2_table: entry %d", i);
- while ( i-- > 0 )
- if ( is_guest_l2_slot(d, type, i) )
- put_page_from_l2e(pl2e[i], pfn);
-
- unmap_domain_page(pl2e);
- return 0;
+ return rc > 0 ? 0 : rc;
}
-
-#if CONFIG_PAGING_LEVELS >= 3
-static int alloc_l3_table(struct page_info *page)
+static int alloc_l3_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
- int i;
+ unsigned int i;
+ int rc = 0;
#if CONFIG_PAGING_LEVELS == 3
/*
@@ -1181,7 +1200,7 @@ static int alloc_l3_table(struct page_in
d->vcpu[0] && d->vcpu[0]->is_initialised )
{
MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
- return 0;
+ return -EINVAL;
}
#endif
@@ -1197,64 +1216,96 @@ static int alloc_l3_table(struct page_in
if ( is_pv_32on64_domain(d) )
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
- for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
{
if ( is_pv_32bit_domain(d) && (i == 3) )
{
if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
- (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
- !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
- PGT_l2_page_table |
- PGT_pae_xen_l2,
- d) )
- goto fail;
+ (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+ rc = -EINVAL;
+ else
+ rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
+ PGT_l2_page_table |
+ PGT_pae_xen_l2,
+ d, preemptible);
}
- else if ( !is_guest_l3_slot(i) )
+ else if ( !is_guest_l3_slot(i) ||
+ (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
continue;
- else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
- goto fail;
+
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ if ( rc < 0 )
+ break;
adjust_guest_l3e(pl3e[i], d);
}
- if ( !create_pae_xen_mappings(d, pl3e) )
- goto fail;
-
- unmap_domain_page(pl3e);
- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l3_table: entry %d", i);
- while ( i-- > 0 )
+ if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+ rc = -EINVAL;
+ if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
{
- if ( !is_guest_l3_slot(i) )
- continue;
- unadjust_guest_l3e(pl3e[i], d);
- put_page_from_l3e(pl3e[i], pfn);
+ MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+ while ( i-- > 0 )
+ {
+ if ( !is_guest_l3_slot(i) )
+ continue;
+ unadjust_guest_l3e(pl3e[i], d);
+ put_page_from_l3e(pl3e[i], pfn, 0);
+ }
}
unmap_domain_page(pl3e);
- return 0;
+ return rc > 0 ? 0 : rc;
}
-#else
-#define alloc_l3_table(page) (0)
-#endif
#if CONFIG_PAGING_LEVELS >= 4
-static int alloc_l4_table(struct page_info *page)
+static int alloc_l4_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
- int i;
+ unsigned int i;
+ int rc = 0;
- for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
{
- if ( !is_guest_l4_slot(d, i) )
+ if ( !is_guest_l4_slot(d, i) ||
+ (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
continue;
- if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
- goto fail;
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR )
+ {
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ }
+ else if ( rc < 0 )
+ {
+ MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+ while ( i-- > 0 )
+ if ( is_guest_l4_slot(d, i) )
+ put_page_from_l4e(pl4e[i], pfn, 0);
+ }
+ if ( rc < 0 )
+ return rc;
adjust_guest_l4e(pl4e[i], d);
}
@@ -1269,18 +1320,10 @@ static int alloc_l4_table(struct page_in
l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
__PAGE_HYPERVISOR);
- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l4_table: entry %d", i);
- while ( i-- > 0 )
- if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn);
-
- return 0;
+ return rc > 0 ? 0 : rc;
}
#else
-#define alloc_l4_table(page) (0)
+#define alloc_l4_table(page, preemptible) (-EINVAL)
#endif
@@ -1289,7 +1332,7 @@ static void free_l1_table(struct page_in
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l1_pgentry_t *pl1e;
- int i;
+ unsigned int i;
pl1e = map_domain_page(pfn);
@@ -1301,74 +1344,114 @@ static void free_l1_table(struct page_in
}
-static void free_l2_table(struct page_info *page)
+static int free_l2_table(struct page_info *page, int preemptible)
{
#ifdef CONFIG_COMPAT
struct domain *d = page_get_owner(page);
#endif
unsigned long pfn = page_to_mfn(page);
l2_pgentry_t *pl2e;
- int i;
+ unsigned int i = page->nr_validated_ptes - 1;
+ int err = 0;
pl2e = map_domain_page(pfn);
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
- put_page_from_l2e(pl2e[i], pfn);
+ ASSERT(page->nr_validated_ptes);
+ do {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+ put_page_from_l2e(pl2e[i], pfn) == 0 &&
+ preemptible && i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ err = -EAGAIN;
+ }
+ } while ( !err && i-- );
unmap_domain_page(pl2e);
- page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-}
-
+ if ( !err )
+ page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-#if CONFIG_PAGING_LEVELS >= 3
+ return err;
+}
-static void free_l3_table(struct page_info *page)
+static int free_l3_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
- int i;
+ unsigned int i = page->nr_validated_ptes - !page->partial_pte;
+ int rc = 0;
#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
if ( d->arch.relmem == RELMEM_l3 )
- return;
+ return 0;
#endif
pl3e = map_domain_page(pfn);
- for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ do {
if ( is_guest_l3_slot(i) )
{
- put_page_from_l3e(pl3e[i], pfn);
+ rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+ if ( rc > 0 )
+ continue;
+ if ( rc )
+ break;
unadjust_guest_l3e(pl3e[i], d);
}
+ } while ( i-- );
unmap_domain_page(pl3e);
-}
-#endif
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ return rc > 0 ? 0 : rc;
+}
#if CONFIG_PAGING_LEVELS >= 4
-
-static void free_l4_table(struct page_info *page)
+static int free_l4_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
- int i;
+ unsigned int i = page->nr_validated_ptes - !page->partial_pte;
+ int rc = 0;
#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
if ( d->arch.relmem == RELMEM_l4 )
- return;
+ return 0;
#endif
- for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+ do {
if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn);
-}
+ rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
+ } while ( rc >= 0 && i-- );
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ return rc > 0 ? 0 : rc;
+}
+#else
+#define free_l4_table(page, preemptible) (-EINVAL)
#endif
static void page_lock(struct page_info *page)
@@ -1560,7 +1643,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
return rc;
}
- if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
+ if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
return page_unlock(l2pg), 0;
adjust_guest_l2e(nl2e, d);
@@ -1583,24 +1666,23 @@ static int mod_l2_entry(l2_pgentry_t *pl
return rc;
}
-#if CONFIG_PAGING_LEVELS >= 3
-
/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
static int mod_l3_entry(l3_pgentry_t *pl3e,
l3_pgentry_t nl3e,
unsigned long pfn,
- int preserve_ad)
+ int preserve_ad,
+ int preemptible)
{
l3_pgentry_t ol3e;
struct vcpu *curr = current;
struct domain *d = curr->domain;
struct page_info *l3pg = mfn_to_page(pfn);
- int rc = 1;
+ int rc = 0;
if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
{
MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
- return 0;
+ return -EINVAL;
}
/*
@@ -1608,12 +1690,12 @@ static int mod_l3_entry(l3_pgentry_t *pl
* would be a pain to ensure they remain continuously valid throughout.
*/
if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
- return 0;
+ return -EINVAL;
page_lock(l3pg);
if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
- return page_unlock(l3pg), 0;
+ return page_unlock(l3pg), -EFAULT;
if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
{
@@ -1622,7 +1704,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
page_unlock(l3pg);
MEM_LOG("Bad L3 flags %x",
l3e_get_flags(nl3e) & l3_disallow_mask(d));
- return 0;
+ return -EINVAL;
}
/* Fast path for identical mapping and presence. */
@@ -1631,28 +1713,30 @@ static int mod_l3_entry(l3_pgentry_t *pl
adjust_guest_l3e(nl3e, d);
rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
page_unlock(l3pg);
- return rc;
+ return rc ? 0 : -EFAULT;
}
- if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
- return page_unlock(l3pg), 0;
+ rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+ if ( unlikely(rc < 0) )
+ return page_unlock(l3pg), rc;
+ rc = 0;
adjust_guest_l3e(nl3e, d);
if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
preserve_ad)) )
{
ol3e = nl3e;
- rc = 0;
+ rc = -EFAULT;
}
}
else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
preserve_ad)) )
{
page_unlock(l3pg);
- return 0;
+ return -EFAULT;
}
- if ( likely(rc) )
+ if ( likely(rc == 0) )
{
if ( !create_pae_xen_mappings(d, pl3e) )
BUG();
@@ -1661,36 +1745,35 @@ static int mod_l3_entry(l3_pgentry_t *pl
}
page_unlock(l3pg);
- put_page_from_l3e(ol3e, pfn);
+ put_page_from_l3e(ol3e, pfn, 0);
return rc;
}
-#endif
-
#if CONFIG_PAGING_LEVELS >= 4
/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
static int mod_l4_entry(l4_pgentry_t *pl4e,
l4_pgentry_t nl4e,
unsigned long pfn,
- int preserve_ad)
+ int preserve_ad,
+ int preemptible)
{
struct vcpu *curr = current;
struct domain *d = curr->domain;
l4_pgentry_t ol4e;
struct page_info *l4pg = mfn_to_page(pfn);
- int rc = 1;
+ int rc = 0;
if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
{
MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
- return 0;
+ return -EINVAL;
}
page_lock(l4pg);
if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
- return page_unlock(l4pg), 0;
+ return page_unlock(l4pg), -EFAULT;
if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
{
@@ -1699,7 +1782,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
page_unlock(l4pg);
MEM_LOG("Bad L4 flags %x",
l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
- return 0;
+ return -EINVAL;
}
/* Fast path for identical mapping and presence. */
@@ -1708,29 +1791,31 @@ static int mod_l4_entry(l4_pgentry_t *pl
adjust_guest_l4e(nl4e, d);
rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
page_unlock(l4pg);
- return rc;
+ return rc ? 0 : -EFAULT;
}
- if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
- return page_unlock(l4pg), 0;
+ rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+ if ( unlikely(rc < 0) )
+ return page_unlock(l4pg), rc;
+ rc = 0;
adjust_guest_l4e(nl4e, d);
if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
preserve_ad)) )
{
ol4e = nl4e;
- rc = 0;
+ rc = -EFAULT;
}
}
else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
preserve_ad)) )
{
page_unlock(l4pg);
- return 0;
+ return -EFAULT;
}
page_unlock(l4pg);
- put_page_from_l4e(ol4e, pfn);
+ put_page_from_l4e(ol4e, pfn, 0);
return rc;
}
@@ -1788,9 +1873,11 @@ int get_page(struct page_info *page, str
}
-static int alloc_page_type(struct page_info *page, unsigned long type)
+static int alloc_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *owner = page_get_owner(page);
+ int rc;
/* A page table is dirtied when its type count becomes non-zero. */
if ( likely(owner != NULL) )
@@ -1799,30 +1886,65 @@ static int alloc_page_type(struct page_i
switch ( type & PGT_type_mask )
{
case PGT_l1_page_table:
- return alloc_l1_table(page);
+ /* alloc_l1_table() returns 0 or -EINVAL; do not discard a failure. */
+ rc = alloc_l1_table(page);
+ break;
case PGT_l2_page_table:
- return alloc_l2_table(page, type);
+ rc = alloc_l2_table(page, type, preemptible);
+ break;
case PGT_l3_page_table:
- return alloc_l3_table(page);
+ rc = alloc_l3_table(page, preemptible);
+ break;
case PGT_l4_page_table:
- return alloc_l4_table(page);
+ rc = alloc_l4_table(page, preemptible);
+ break;
case PGT_seg_desc_page:
- return alloc_segdesc_page(page);
+ rc = alloc_segdesc_page(page);
+ break;
default:
printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
type, page->u.inuse.type_info,
page->count_info);
+ rc = -EINVAL;
BUG();
}
- return 0;
+ /* No need for atomic update of type_info here: no one else updates it. */
+ wmb();
+ if ( rc == -EAGAIN )
+ {
+ page->u.inuse.type_info |= PGT_partial;
+ }
+ else if ( rc == -EINTR )
+ {
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+ page->u.inuse.type_info &= ~PGT_count_mask;
+ }
+ else if ( rc )
+ {
+ ASSERT(rc < 0);
+ MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
+ PRtype_info ": caf=%08x taf=%" PRtype_info,
+ page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
+ type, page->count_info, page->u.inuse.type_info);
+ page->u.inuse.type_info = 0;
+ }
+ else
+ {
+ page->u.inuse.type_info |= PGT_validated;
+ }
+
+ return rc;
}
-void free_page_type(struct page_info *page, unsigned long type)
+int free_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *owner = page_get_owner(page);
unsigned long gmfn;
+ int rc;
if ( likely(owner != NULL) )
{
@@ -1842,7 +1964,7 @@ void free_page_type(struct page_info *pa
paging_mark_dirty(owner, page_to_mfn(page));
if ( shadow_mode_refcounts(owner) )
- return;
+ return 0;
gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
@@ -1850,42 +1972,80 @@ void free_page_type(struct page_info *pa
}
}
+ if ( !(type & PGT_partial) )
+ {
+ page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+ page->partial_pte = 0;
+ }
switch ( type & PGT_type_mask )
{
case PGT_l1_page_table:
free_l1_table(page);
+ rc = 0;
break;
-
case PGT_l2_page_table:
- free_l2_table(page);
+ rc = free_l2_table(page, preemptible);
break;
-
-#if CONFIG_PAGING_LEVELS >= 3
case PGT_l3_page_table:
- free_l3_table(page);
- break;
+#if CONFIG_PAGING_LEVELS == 3
+ if ( !(type & PGT_partial) )
+ page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
+ rc = free_l3_table(page, preemptible);
+ break;
case PGT_l4_page_table:
- free_l4_table(page);
+ rc = free_l4_table(page, preemptible);
break;
-#endif
-
default:
- printk("%s: type %lx pfn %lx\n",__FUNCTION__,
- type, page_to_mfn(page));
+ MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
+ rc = -EINVAL;
BUG();
}
+
+ /* No need for atomic update of type_info here: no one else updates it. */
+ if ( rc == 0 )
+ {
+ /*
+ * Record TLB information for flush later. We do not stamp page tables
+ * when running in shadow mode:
+ * 1. Pointless, since it's the shadow pt's which must be tracked.
+ * 2. Shadow mode reuses this field for shadowed page tables to
+ * store flags info -- we don't want to conflict with that.
+ */
+ if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+ (page->count_info & PGC_page_table)) )
+ page->tlbflush_timestamp = tlbflush_current_time();
+ wmb();
+ page->u.inuse.type_info--;
+ }
+ else if ( rc == -EINTR )
+ {
+ ASSERT(!(page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)));
+ if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+ (page->count_info & PGC_page_table)) )
+ page->tlbflush_timestamp = tlbflush_current_time();
+ wmb();
+ page->u.inuse.type_info |= PGT_validated;
+ }
+ else
+ {
+ BUG_ON(rc != -EAGAIN);
+ wmb();
+ page->u.inuse.type_info |= PGT_partial;
+ }
+
+ return rc;
}
-void put_page_type(struct page_info *page)
+static int __put_page_type(struct page_info *page,
+ int preemptible)
{
unsigned long nx, x, y = page->u.inuse.type_info;
- again:
- do {
+ for ( ; ; )
+ {
x = y;
nx = x - 1;
@@ -1894,21 +2054,19 @@ void put_page_type(struct page_info *pag
if ( unlikely((nx & PGT_count_mask) == 0) )
{
if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
- likely(nx & PGT_validated) )
+ likely(nx & (PGT_validated|PGT_partial)) )
{
/*
* Page-table pages must be unvalidated when count is zero. The
* 'free' is safe because the refcnt is non-zero and validated
* bit is clear => other ops will spin or fail.
*/
- if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
- x & ~PGT_validated)) != x) )
- goto again;
+ nx = x & ~(PGT_validated|PGT_partial);
+ if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
+ x, nx)) != x) )
+ continue;
/* We cleared the 'valid bit' so we do the clean up. */
- free_page_type(page, x);
- /* Carry on, but with the 'valid bit' now clear. */
- x &= ~PGT_validated;
- nx &= ~PGT_validated;
+ return free_page_type(page, x, preemptible);
}
/*
@@ -1922,25 +2080,33 @@ void put_page_type(struct page_info *pag
(page->count_info & PGC_page_table)) )
page->tlbflush_timestamp = tlbflush_current_time();
}
+
+ if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ break;
+
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
}
- while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+
+ return 0;
}
-int get_page_type(struct page_info *page, unsigned long type)
+static int __get_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
unsigned long nx, x, y = page->u.inuse.type_info;
ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
- again:
- do {
+ for ( ; ; )
+ {
x = y;
nx = x + 1;
if ( unlikely((nx & PGT_count_mask) == 0) )
{
MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
- return 0;
+ return -EINVAL;
}
else if ( unlikely((x & PGT_count_mask) == 0) )
{
@@ -1993,28 +2159,43 @@ int get_page_type(struct page_info *page
/* Don't log failure if it could be a recursive-mapping attempt. */
if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
(type == PGT_l1_page_table) )
- return 0;
+ return -EINVAL;
if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
(type == PGT_l2_page_table) )
- return 0;
+ return -EINVAL;
if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
(type == PGT_l3_page_table) )
- return 0;
+ return -EINVAL;
MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
"for mfn %lx (pfn %lx)",
x, type, page_to_mfn(page),
get_gpfn_from_mfn(page_to_mfn(page)));
- return 0;
+ return -EINVAL;
}
else if ( unlikely(!(x & PGT_validated)) )
{
- /* Someone else is updating validation of this page. Wait... */
- while ( (y = page->u.inuse.type_info) == x )
- cpu_relax();
- goto again;
+ if ( !(x & PGT_partial) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+ while ( (y = page->u.inuse.type_info) == x )
+ {
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ cpu_relax();
+ }
+ continue;
+ }
+ /* Type ref count was left at 1 when PGT_partial got set. */
+ ASSERT((x & PGT_count_mask) == 1);
+ nx = x & ~PGT_partial;
}
+
+ if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ break;
+
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
}
- while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
if ( unlikely((x & PGT_type_mask) != type) )
{
@@ -2032,25 +2213,42 @@ int get_page_type(struct page_info *page
if ( unlikely(!(nx & PGT_validated)) )
{
- /* Try to validate page type; drop the new reference on failure. */
- if ( unlikely(!alloc_page_type(page, type)) )
+ if ( !(x & PGT_partial) )
{
- MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
- PRtype_info ": caf=%08x taf=%" PRtype_info,
- page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
- type, page->count_info, page->u.inuse.type_info);
- /* Noone else can get a reference. We hold the only ref. */
- page->u.inuse.type_info = 0;
- return 0;
+ page->nr_validated_ptes = 0;
+ page->partial_pte = 0;
}
-
- /* Noone else is updating simultaneously. */
- __set_bit(_PGT_validated, &page->u.inuse.type_info);
+ return alloc_page_type(page, type, preemptible);
}
- return 1;
+ return 0;
}
+void put_page_type(struct page_info *page)
+{
+ int rc = __put_page_type(page, 0);
+ ASSERT(rc == 0);
+ (void)rc;
+}
+
+int get_page_type(struct page_info *page, unsigned long type)
+{
+ int rc = __get_page_type(page, type, 0);
+ if ( likely(rc == 0) )
+ return 1;
+ ASSERT(rc == -EINVAL);
+ return 0;
+}
+
+int put_page_type_preemptible(struct page_info *page)
+{
+ return __put_page_type(page, 1);
+}
+
+int get_page_type_preemptible(struct page_info *page, unsigned long type)
+{
+ return __get_page_type(page, type, 1);
+}
void cleanup_page_cacheattr(struct page_info *page)
{
@@ -2087,7 +2285,7 @@ int new_guest_cr3(unsigned long mfn)
l4e_from_pfn(
mfn,
(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
- pagetable_get_pfn(v->arch.guest_table), 0);
+ pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new compat baseptr %lx", mfn);
@@ -2102,7 +2300,7 @@ int new_guest_cr3(unsigned long mfn)
#endif
okay = paging_mode_refcounts(d)
? get_page_from_pagenr(mfn, d)
- : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+ : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2276,9 +2474,7 @@ int do_mmuext_op(
{
if ( hypercall_preempt_check() )
{
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmuext_op, "hihi",
- uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = -EAGAIN;
break;
}
@@ -2325,10 +2521,14 @@ int do_mmuext_op(
if ( paging_mode_refcounts(FOREIGNDOM) )
break;
- okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
+ rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+ okay = !rc;
if ( unlikely(!okay) )
{
- MEM_LOG("Error while pinning mfn %lx", mfn);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
+ else if ( rc != -EAGAIN )
+ MEM_LOG("Error while pinning mfn %lx", mfn);
break;
}
@@ -2373,8 +2573,11 @@ int do_mmuext_op(
{
put_page_and_type(page);
put_page(page);
- /* A page is dirtied when its pin status is cleared. */
- paging_mark_dirty(d, mfn);
+ if ( !rc )
+ {
+ /* A page is dirtied when its pin status is cleared. */
+ paging_mark_dirty(d, mfn);
+ }
}
else
{
@@ -2398,8 +2601,8 @@ int do_mmuext_op(
if ( paging_mode_refcounts(d) )
okay = get_page_from_pagenr(mfn, d);
else
- okay = get_page_and_type_from_pagenr(
- mfn, PGT_root_page_table, d);
+ okay = !get_page_and_type_from_pagenr(
+ mfn, PGT_root_page_table, d, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2517,6 +2720,11 @@ int do_mmuext_op(
guest_handle_add_offset(uops, 1);
}
+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
process_deferred_ops();
perfc_add(num_mmuext_ops, i);
@@ -2576,9 +2784,7 @@ int do_mmu_update(
{
if ( hypercall_preempt_check() )
{
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmu_update, "hihi",
- ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = -EAGAIN;
break;
}
@@ -2653,27 +2859,29 @@ int do_mmu_update(
cmd == MMU_PT_UPDATE_PRESERVE_AD);
}
break;
-#if CONFIG_PAGING_LEVELS >= 3
case PGT_l3_page_table:
{
l3_pgentry_t l3e = l3e_from_intpte(req.val);
- okay = mod_l3_entry(va, l3e, mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD);
+ rc = mod_l3_entry(va, l3e, mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+ okay = !rc;
}
break;
-#endif
#if CONFIG_PAGING_LEVELS >= 4
case PGT_l4_page_table:
{
l4_pgentry_t l4e = l4e_from_intpte(req.val);
- okay = mod_l4_entry(va, l4e, mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD);
+ rc = mod_l4_entry(va, l4e, mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+ okay = !rc;
}
break;
#endif
}
put_page_type(page);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
}
break;
@@ -2742,6 +2950,11 @@ int do_mmu_update(
guest_handle_add_offset(ureqs, 1);
}
+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
process_deferred_ops();
domain_mmap_cache_destroy(&mapcache);
@@ -3617,9 +3830,8 @@ static int ptwr_emulated_update(
nl1e = l1e_from_intpte(val);
if ( unlikely(!get_page_from_l1e(nl1e, d)) )
{
- if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
- (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
- (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+ if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
+ !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
{
/*
* If this is an upper-half write to a PAE PTE then we assume that
Index: xen-3.3.1-testing/xen/include/asm-x86/mm.h
===================================================================
--- xen-3.3.1-testing.orig/xen/include/asm-x86/mm.h
+++ xen-3.3.1-testing/xen/include/asm-x86/mm.h
@@ -59,6 +59,17 @@ struct page_info
u32 tlbflush_timestamp;
/*
+ * When PGT_partial is true then this field is valid and indicates
+ * that PTEs in the range [0, @nr_validated_ptes) have been validated.
+ * If @partial_pte is true then the PTE at index @nr_validated_ptes (the
+ * first one past the validated range) has been partially validated.
+ */
+ struct {
+ u16 nr_validated_ptes;
+ bool_t partial_pte;
+ };
+
+ /*
* Guest pages with a shadow. This does not conflict with
* tlbflush_timestamp since page table pages are explicitly not
* tracked for TLB-flush avoidance when a guest runs in shadow mode.
@@ -86,9 +97,12 @@ struct page_info
/* PAE only: is this an L2 page directory containing Xen-private mappings? */
#define _PGT_pae_xen_l2 26
#define PGT_pae_xen_l2 (1U<<_PGT_pae_xen_l2)
+/* Has this page been *partially* validated for use as its current type? */
+#define _PGT_partial 25
+#define PGT_partial (1U<<_PGT_partial)
- /* 26-bit count of uses of this frame as its current type. */
-#define PGT_count_mask ((1U<<26)-1)
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1U<<25)-1)
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
@@ -154,7 +168,8 @@ extern unsigned long max_page;
extern unsigned long total_pages;
void init_frametable(void);
-void free_page_type(struct page_info *page, unsigned long type);
+int free_page_type(struct page_info *page, unsigned long type,
+ int preemptible);
int _shadow_mode_refcounts(struct domain *d);
void cleanup_page_cacheattr(struct page_info *page);
@@ -165,6 +180,8 @@ void put_page(struct page_info *page);
int get_page(struct page_info *page, struct domain *domain);
void put_page_type(struct page_info *page);
int get_page_type(struct page_info *page, unsigned long type);
+int put_page_type_preemptible(struct page_info *page);
+int get_page_type_preemptible(struct page_info *page, unsigned long type);
int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
@@ -174,6 +191,19 @@ static inline void put_page_and_type(str
put_page(page);
}
+static inline int put_page_and_type_preemptible(struct page_info *page,
+ int preemptible)
+{
+ int rc = 0;
+
+ if ( preemptible )
+ rc = put_page_type_preemptible(page);
+ else
+ put_page_type(page);
+ if ( likely(rc == 0) )
+ put_page(page);
+ return rc;
+}
static inline int get_page_and_type(struct page_info *page,
struct domain *domain,