diff --git a/18720-x86-dom-cleanup.patch b/18720-x86-dom-cleanup.patch new file mode 100644 index 0000000..72f6fe9 --- /dev/null +++ b/18720-x86-dom-cleanup.patch @@ -0,0 +1,75 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225113763 0 +# Node ID 11c86c51a697dab2e4a49efe3dda139ea206f423 +# Parent 101e50cffc7825065f4dd39610728a2ba3ea68b4 +x86: fix domain cleanup + +The preemptable page type handling changes modified free_page_type() +behavior without adjusting the call site in relinquish_memory(): Any +type reference left pending when leaving hypercall handlers is +associated with a page reference, and when successful free_page_type() +decrements the type refcount - hence relinquish_memory() must now also +drop the page reference. + +Also, the recursion avoidance during domain shutdown somehow (probably +by me when I merged the patch up to a newer snapshot) got screwed up: +The avoidance logic in mm.c should short circuit levels below the top +one currently being processed, rather than the top one itself. 
+ +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1687,6 +1687,7 @@ static int relinquish_memory( + { + if ( free_page_type(page, x, 0) != 0 ) + BUG(); ++ put_page(page); + break; + } + } +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1343,7 +1343,7 @@ static void free_l1_table(struct page_in + + static int free_l2_table(struct page_info *page, int preemptible) + { +-#ifdef CONFIG_COMPAT ++#if defined(CONFIG_COMPAT) || defined(DOMAIN_DESTRUCT_AVOID_RECURSION) + struct domain *d = page_get_owner(page); + #endif + unsigned long pfn = page_to_mfn(page); +@@ -1351,6 +1351,11 @@ static int free_l2_table(struct page_inf + unsigned int i = page->nr_validated_ptes - 1; + int err = 0; + ++#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION ++ if ( d->arch.relmem == RELMEM_l3 ) ++ return 0; ++#endif ++ + pl2e = map_domain_page(pfn); + + ASSERT(page->nr_validated_ptes); +@@ -1381,7 +1386,7 @@ static int free_l3_table(struct page_inf + int rc = 0; + + #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION +- if ( d->arch.relmem == RELMEM_l3 ) ++ if ( d->arch.relmem == RELMEM_l4 ) + return 0; + #endif + +@@ -1424,11 +1429,6 @@ static int free_l4_table(struct page_inf + unsigned int i = page->nr_validated_ptes - !page->partial_pte; + int rc = 0; + +-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION +- if ( d->arch.relmem == RELMEM_l4 ) +- return 0; +-#endif +- + do { + if ( is_guest_l4_slot(d, i) ) + rc = put_page_from_l4e(pl4e[i], pfn, preemptible); diff --git a/18722-x86-fixmap-reserved.patch b/18722-x86-fixmap-reserved.patch new file mode 100644 index 0000000..32fa665 --- /dev/null +++ b/18722-x86-fixmap-reserved.patch @@ -0,0 +1,19 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225114010 0 +# Node ID 604ffa3cdcc48bbfcfe5e4ccd0af735ddc49d839 +# Parent 15aed96c7b5cd5a435754a57db13cd72b386717a +x86: First fixmap entry (0) is invalid. 
+Signed-off-by: Jan Beulich +Signed-off-by: Keir Fraser + +--- a/xen/include/asm-x86/fixmap.h ++++ b/xen/include/asm-x86/fixmap.h +@@ -29,6 +29,7 @@ + * from the end of virtual memory backwards. + */ + enum fixed_addresses { ++ FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */ + #ifdef __i386__ + FIX_PAE_HIGHMEM_0, + FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1, diff --git a/18723-unmap-dom-page-const.patch b/18723-unmap-dom-page-const.patch new file mode 100644 index 0000000..35018be --- /dev/null +++ b/18723-unmap-dom-page-const.patch @@ -0,0 +1,58 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225114053 0 +# Node ID 9bbb54fd9181644d2bdd3c7f93c2cba1dac1b719 +# Parent 604ffa3cdcc48bbfcfe5e4ccd0af735ddc49d839 +Constify arguments to unmap_domain_page() etc. + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/x86_32/domain_page.c ++++ b/xen/arch/x86/x86_32/domain_page.c +@@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn) + return (void *)va; + } + +-void unmap_domain_page(void *va) ++void unmap_domain_page(const void *va) + { + unsigned int idx; + struct vcpu *v; +@@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo + return (void *)va; + } + +-void unmap_domain_page_global(void *va) ++void unmap_domain_page_global(const void *va) + { + unsigned long __va = (unsigned long)va; + l2_pgentry_t *pl2e; +--- a/xen/include/xen/domain_page.h ++++ b/xen/include/xen/domain_page.h +@@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn) + * Pass a VA within a page previously mapped in the context of the + * currently-executing VCPU via a call to map_domain_page(). + */ +-void unmap_domain_page(void *va); ++void unmap_domain_page(const void *va); + + /* + * Similar to the above calls, except the mapping is accessible in all +@@ -32,7 +32,7 @@ void unmap_domain_page(void *va); + * mappings can also be unmapped from any context. 
+ */ + void *map_domain_page_global(unsigned long mfn); +-void unmap_domain_page_global(void *va); ++void unmap_domain_page_global(const void *va); + + #define DMCACHE_ENTRY_VALID 1U + #define DMCACHE_ENTRY_HELD 2U +@@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long + } + + static inline void +-unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache) ++unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache) + { + ASSERT(cache != NULL); + cache->flags &= ~DMCACHE_ENTRY_HELD; diff --git a/i386-highmem-assist.patch b/18724-i386-highmem-assist.patch similarity index 59% rename from i386-highmem-assist.patch rename to 18724-i386-highmem-assist.patch index 5bcd657..15891cd 100644 --- a/i386-highmem-assist.patch +++ b/18724-i386-highmem-assist.patch @@ -1,9 +1,27 @@ --unstable staging c/s 18724 +# HG changeset patch +# User Keir Fraser +# Date 1225114175 0 +# Node ID 4413d53a8320809e93142ed599a81e1bfe5ae900 +# Parent 9bbb54fd9181644d2bdd3c7f93c2cba1dac1b719 +x86: highmem handling assistance hypercalls -Index: xen-3.3.1-testing/xen/arch/x86/mm.c -=================================================================== ---- xen-3.3.1-testing.orig/xen/arch/x86/mm.c -+++ xen-3.3.1-testing/xen/arch/x86/mm.c +While looking at the origin of very frequently executed hypercalls I +realized that the high page accessor functions in Linux would be good +candidates to handle in the hypervisor - clearing or copying to/from +a high page is a pretty frequent operation (provided there's enough +memory in the domain). 
While prior to the first submission I only +measured kernel builds (where the results are not hinting at a +meaningful improvement), I now found time to do a more specific +analysis: page clearing is being improved by about 20%, page copying +doesn't seem to significantly benefit (though that may be an effect of +the simplistic copy_page() implementation Xen currently uses) - +nevertheless I would think that if one function is supported by the +hypervisor, then the other should also be. + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c @@ -2431,6 +2431,29 @@ static inline cpumask_t vcpumask_to_pcpu return pmask; } @@ -101,32 +119,8 @@ Index: xen-3.3.1-testing/xen/arch/x86/mm.c default: MEM_LOG("Invalid extended pt command 0x%x", op.cmd); rc = -ENOSYS; -Index: xen-3.3.1-testing/xen/arch/x86/x86_32/domain_page.c -=================================================================== ---- xen-3.3.1-testing.orig/xen/arch/x86/x86_32/domain_page.c -+++ xen-3.3.1-testing/xen/arch/x86/x86_32/domain_page.c -@@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn) - return (void *)va; - } - --void unmap_domain_page(void *va) -+void unmap_domain_page(const void *va) - { - unsigned int idx; - struct vcpu *v; -@@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo - return (void *)va; - } - --void unmap_domain_page_global(void *va) -+void unmap_domain_page_global(const void *va) - { - unsigned long __va = (unsigned long)va; - l2_pgentry_t *pl2e; -Index: xen-3.3.1-testing/xen/arch/x86/x86_64/compat/mm.c -=================================================================== ---- xen-3.3.1-testing.orig/xen/arch/x86/x86_64/compat/mm.c -+++ xen-3.3.1-testing/xen/arch/x86/x86_64/compat/mm.c +--- a/xen/arch/x86/x86_64/compat/mm.c ++++ b/xen/arch/x86/x86_64/compat/mm.c @@ -217,6 +217,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm case MMUEXT_PIN_L4_TABLE: case MMUEXT_UNPIN_TABLE: @@ -146,36 +140,30 @@ Index: 
xen-3.3.1-testing/xen/arch/x86/x86_64/compat/mm.c default: arg2 = -1; break; -Index: xen-3.3.1-testing/xen/include/asm-x86/fixmap.h -=================================================================== ---- xen-3.3.1-testing.orig/xen/include/asm-x86/fixmap.h -+++ xen-3.3.1-testing/xen/include/asm-x86/fixmap.h -@@ -29,6 +29,7 @@ - * from the end of virtual memory backwards. - */ - enum fixed_addresses { -+ FIX_HOLE, - #ifdef __i386__ - FIX_PAE_HIGHMEM_0, - FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1, -Index: xen-3.3.1-testing/xen/include/public/features.h -=================================================================== ---- xen-3.3.1-testing.orig/xen/include/public/features.h -+++ xen-3.3.1-testing/xen/include/public/features.h -@@ -65,6 +65,9 @@ - */ - #define XENFEAT_gnttab_map_avail_bits 7 +--- a/xen/common/kernel.c ++++ b/xen/common/kernel.c +@@ -222,6 +222,7 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL + #ifdef CONFIG_X86 + if ( !is_hvm_vcpu(current) ) + fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) | ++ (1U << XENFEAT_highmem_assist) | + (1U << XENFEAT_gnttab_map_avail_bits); + #endif + break; +--- a/xen/include/public/features.h ++++ b/xen/include/public/features.h +@@ -59,6 +59,9 @@ + /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ + #define XENFEAT_mmu_pt_update_preserve_ad 5 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ +#define XENFEAT_highmem_assist 6 + - #define XENFEAT_NR_SUBMAPS 1 - - #endif /* __XEN_PUBLIC_FEATURES_H__ */ -Index: xen-3.3.1-testing/xen/include/public/xen.h -=================================================================== ---- xen-3.3.1-testing.orig/xen/include/public/xen.h -+++ xen-3.3.1-testing/xen/include/public/xen.h + /* + * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel + * available pte bits. 
+--- a/xen/include/public/xen.h ++++ b/xen/include/public/xen.h @@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). @@ -216,34 +204,3 @@ Index: xen-3.3.1-testing/xen/include/public/xen.h } arg2; }; typedef struct mmuext_op mmuext_op_t; -Index: xen-3.3.1-testing/xen/include/xen/domain_page.h -=================================================================== ---- xen-3.3.1-testing.orig/xen/include/xen/domain_page.h -+++ xen-3.3.1-testing/xen/include/xen/domain_page.h -@@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn) - * Pass a VA within a page previously mapped in the context of the - * currently-executing VCPU via a call to map_domain_page(). - */ --void unmap_domain_page(void *va); -+void unmap_domain_page(const void *va); - - /* - * Similar to the above calls, except the mapping is accessible in all -@@ -32,7 +32,7 @@ void unmap_domain_page(void *va); - * mappings can also be unmapped from any context. - */ - void *map_domain_page_global(unsigned long mfn); --void unmap_domain_page_global(void *va); -+void unmap_domain_page_global(const void *va); - - #define DMCACHE_ENTRY_VALID 1U - #define DMCACHE_ENTRY_HELD 2U -@@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long - } - - static inline void --unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache) -+unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache) - { - ASSERT(cache != NULL); - cache->flags &= ~DMCACHE_ENTRY_HELD; diff --git a/18731-x86-dom-cleanup.patch b/18731-x86-dom-cleanup.patch new file mode 100644 index 0000000..757e140 --- /dev/null +++ b/18731-x86-dom-cleanup.patch @@ -0,0 +1,57 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225193120 0 +# Node ID 19549b9766fdd68380ded8efd975c41269ab2801 +# Parent 2c20d026bb55722247c0d9ab81c125118a10346f +x86: Fix circular page reference destruction in relinquish_memory(). 
+ +Tested by Jan Beulich and fixes a memory leak, but there is more to be +done here. + +Signed-off-by: Keir Fraser + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1687,7 +1687,6 @@ static int relinquish_memory( + { + if ( free_page_type(page, x, 0) != 0 ) + BUG(); +- put_page(page); + break; + } + } +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1973,6 +1973,7 @@ int free_page_type(struct page_info *pag + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; + page->partial_pte = 0; + } ++ + switch ( type & PGT_type_mask ) + { + case PGT_l1_page_table: +@@ -1998,6 +1999,15 @@ int free_page_type(struct page_info *pag + BUG(); + } + ++ return rc; ++} ++ ++ ++static int __put_final_page_type( ++ struct page_info *page, unsigned long type, int preemptible) ++{ ++ int rc = free_page_type(page, type, preemptible); ++ + /* No need for atomic update of type_info here: noone else updates it. */ + if ( rc == 0 ) + { +@@ -2062,7 +2072,7 @@ static int __put_page_type(struct page_i + x, nx)) != x) ) + continue; + /* We cleared the 'valid bit' so we do the clean up. */ +- return free_page_type(page, x, preemptible); ++ return __put_final_page_type(page, x, preemptible); + } + + /* diff --git a/18735-x86-dom-cleanup.patch b/18735-x86-dom-cleanup.patch new file mode 100644 index 0000000..7a169b5 --- /dev/null +++ b/18735-x86-dom-cleanup.patch @@ -0,0 +1,22 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225285777 0 +# Node ID ae100f264f6ad4e828de1ca2d228cccf6ed2bbfd +# Parent 183d2d7adc2f02db63aedaf199e3b006d2e4a053 +x86: Fix relinquish_memory() for PGT_partial pages. + +Original patch by Jan Beulich. 
+ +Signed-off-by: Keir Fraser + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1687,6 +1687,8 @@ static int relinquish_memory( + { + if ( free_page_type(page, x, 0) != 0 ) + BUG(); ++ if ( x & PGT_partial ) ++ page->u.inuse.type_info--; + break; + } + } diff --git a/x86-dom-cleanup-no-hack.patch b/18741-x86-dom-cleanup-no-hack.patch similarity index 92% rename from x86-dom-cleanup-no-hack.patch rename to 18741-x86-dom-cleanup-no-hack.patch index 3769f8e..7b66581 100644 --- a/x86-dom-cleanup-no-hack.patch +++ b/18741-x86-dom-cleanup-no-hack.patch @@ -1,3 +1,13 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225377468 0 +# Node ID 9e5cf6778a6d1057900c3709f544ac176ddfab67 +# Parent 112e81ae5824e213b181a65f944b729ba270d658 +x86: eliminate domain cleanup hack in favor of using the preemptable +flavors of the respective functions. + +Signed-off-by: Jan Beulich + --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1639,32 +1639,23 @@ static int relinquish_memory( diff --git a/x86-partial-page-ref.patch b/18742-x86-partial-page-ref.patch similarity index 81% rename from x86-partial-page-ref.patch rename to 18742-x86-partial-page-ref.patch index 4510ca9..800851a 100644 --- a/x86-partial-page-ref.patch +++ b/18742-x86-partial-page-ref.patch @@ -1,14 +1,22 @@ --unstable staging c/s 18742+18747 +# HG changeset patch +# User Keir Fraser +# Date 1225378404 0 +# Node ID ed30f4efb728980ba84c34fc7fdc7be5f5a4a78e +# Parent 9e5cf6778a6d1057900c3709f544ac176ddfab67 +x86: fix preemptable page type handling - retain a page reference when PGT_partial is set on a page (and drop it when clearing that flag) -- don't drop a page reference never acquired when freeing the page type +- don't drop a page reference never acquired when freeing the page + type of a page where the allocation of the type got preempted (and never completed) - don't acquire a page reference when allocating the page type of a page where freeing the type got preempted (and never completed, 
and hence didn't drop the respective reference) +Signed-off-by: Jan Beulich + --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1683,18 +1683,24 @@ static int relinquish_memory( @@ -324,126 +332,96 @@ return rc; } -@@ -1837,7 +1856,8 @@ int get_page(struct page_info *page, str - nx = x + 1; - d = nd; - if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ -- unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ -+ /* Keep one spare reference to be acquired by get_page_light(). */ -+ unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */ - unlikely(d != _domain) ) /* Wrong owner? */ - { - if ( !_shadow_mode_refcounts(domain) && !domain->is_dying ) -@@ -1859,6 +1879,28 @@ int get_page(struct page_info *page, str - return 1; - } +@@ -1866,6 +1885,10 @@ static int alloc_page_type(struct page_i + struct domain *owner = page_get_owner(page); + int rc; -+/* -+ * Special version of get_page() to be used exclusively when -+ * - a page is known to already have a non-zero reference count -+ * - the page does not need its owner to be checked -+ * - it will not be called more than once without dropping the thus -+ * acquired reference again. -+ * Due to get_page() reserving one reference, this call cannot fail. -+ */ -+static void get_page_light(struct page_info *page) -+{ -+ u32 x, nx, y = page->count_info; ++ /* Obtain an extra reference to retain if we set PGT_partial. */ ++ if ( preemptible && !get_page(page, owner) ) ++ return -EINVAL; + -+ do { -+ x = y; -+ nx = x + 1; -+ BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */ -+ BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */ -+ y = cmpxchg(&page->count_info, x, nx); -+ } -+ while ( unlikely(y != x) ); -+} -+ - - static int alloc_page_type(struct page_info *page, unsigned long type, - int preemptible) -@@ -1899,6 +1941,7 @@ static int alloc_page_type(struct page_i - wmb(); + /* A page table is dirtied when its type count becomes non-zero. 
*/ + if ( likely(owner != NULL) ) + paging_mark_dirty(owner, page_to_mfn(page)); +@@ -1900,8 +1923,13 @@ static int alloc_page_type(struct page_i if ( rc == -EAGAIN ) { -+ get_page_light(page); page->u.inuse.type_info |= PGT_partial; ++ return -EAGAIN; } - else if ( rc == -EINTR ) -@@ -2009,8 +2052,8 @@ static int __put_page_type_final(struct - page->u.inuse.type_info--; - break; - case -EINTR: -- ASSERT(!(page->u.inuse.type_info & -- (PGT_count_mask|PGT_validated|PGT_partial))); -+ ASSERT((page->u.inuse.type_info & -+ (PGT_count_mask|PGT_validated|PGT_partial)) == 1); - if ( !(shadow_mode_enabled(page_get_owner(page)) && - (page->count_info & PGC_page_table)) ) - page->tlbflush_timestamp = tlbflush_current_time(); -@@ -2019,6 +2062,7 @@ static int __put_page_type_final(struct - break; - case -EAGAIN: - wmb(); -+ get_page_light(page); - page->u.inuse.type_info |= PGT_partial; - break; - default: -@@ -2033,6 +2077,7 @@ static int __put_page_type(struct page_i - int preemptible) - { - unsigned long nx, x, y = page->u.inuse.type_info; -+ int rc = 0; - - for ( ; ; ) - { -@@ -2056,7 +2101,10 @@ static int __put_page_type(struct page_i - x, nx)) != x) ) - continue; - /* We cleared the 'valid bit' so we do the clean up. 
*/ -- return __put_page_type_final(page, x, preemptible); -+ rc = __put_page_type_final(page, x, preemptible); -+ if ( x & PGT_partial ) -+ put_page(page); -+ break; - } - - /* -@@ -2078,7 +2126,7 @@ static int __put_page_type(struct page_i - return -EINTR; - } - -- return 0; -+ return rc; - } - - -@@ -2086,6 +2134,7 @@ static int __get_page_type(struct page_i - int preemptible) - { - unsigned long nx, x, y = page->u.inuse.type_info; -+ int rc = 0; - - ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); - -@@ -2208,10 +2257,13 @@ static int __get_page_type(struct page_i - page->nr_validated_ptes = 0; - page->partial_pte = 0; - } -- return alloc_page_type(page, type, preemptible); -+ rc = alloc_page_type(page, type, preemptible); - } - -- return 0; -+ if ( (x & PGT_partial) && !(nx & PGT_partial) ) +- else if ( rc == -EINTR ) ++ ++ if ( preemptible ) + put_page(page); + -+ return rc; ++ if ( rc == -EINTR ) + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); +@@ -2029,8 +2057,13 @@ static int __put_final_page_type( + BUG_ON(rc != -EAGAIN); + wmb(); + page->u.inuse.type_info |= PGT_partial; ++ /* Must skip put_page() below. */ ++ preemptible = 0; + } + ++ if ( preemptible ) ++ put_page(page); ++ + return rc; } - void put_page_type(struct page_info *page) -@@ -2290,7 +2342,7 @@ int new_guest_cr3(unsigned long mfn) +@@ -2040,6 +2073,10 @@ static int __put_page_type(struct page_i + { + unsigned long nx, x, y = page->u.inuse.type_info; + ++ /* Obtain an extra reference to retain if we set PGT_partial. */ ++ if ( preemptible && !get_page(page, page_get_owner(page)) ) ++ return -EINVAL; ++ + for ( ; ; ) + { + x = y; +@@ -2061,6 +2098,8 @@ static int __put_page_type(struct page_i + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, + x, nx)) != x) ) + continue; ++ if ( x & PGT_partial ) ++ put_page(page); + /* We cleared the 'valid bit' so we do the clean up. 
*/ + return __put_final_page_type(page, x, preemptible); + } +@@ -2081,9 +2120,16 @@ static int __put_page_type(struct page_i + break; + + if ( preemptible && hypercall_preempt_check() ) ++ { ++ if ( preemptible ) ++ put_page(page); + return -EINTR; ++ } + } + ++ if ( preemptible ) ++ put_page(page); ++ + return 0; + } + +@@ -2187,7 +2233,11 @@ static int __get_page_type(struct page_i + } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) ++ { ++ if ( (x & PGT_partial) && !(nx & PGT_partial) ) ++ put_page(page); + break; ++ } + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; +@@ -2296,7 +2346,7 @@ int new_guest_cr3(unsigned long mfn) #endif okay = paging_mode_refcounts(d) ? get_page_from_pagenr(mfn, d) @@ -452,7 +430,7 @@ if ( unlikely(!okay) ) { MEM_LOG("Error while installing new baseptr %lx", mfn); -@@ -2534,7 +2586,7 @@ int do_mmuext_op( +@@ -2540,7 +2590,7 @@ int do_mmuext_op( if ( paging_mode_refcounts(FOREIGNDOM) ) break; @@ -461,7 +439,7 @@ okay = !rc; if ( unlikely(!okay) ) { -@@ -2615,7 +2667,7 @@ int do_mmuext_op( +@@ -2621,7 +2671,7 @@ int do_mmuext_op( okay = get_page_from_pagenr(mfn, d); else okay = !get_page_and_type_from_pagenr( @@ -470,7 +448,7 @@ if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); -@@ -2722,7 +2774,7 @@ int do_mmuext_op( +@@ -2728,7 +2778,7 @@ int do_mmuext_op( unsigned char *ptr; okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page, @@ -479,7 +457,7 @@ if ( unlikely(!okay) ) { MEM_LOG("Error while clearing mfn %lx", mfn); -@@ -2755,7 +2807,7 @@ int do_mmuext_op( +@@ -2761,7 +2811,7 @@ int do_mmuext_op( } okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page, diff --git a/18747-x86-partial-page-ref.patch b/18747-x86-partial-page-ref.patch new file mode 100644 index 0000000..2595543 --- /dev/null +++ b/18747-x86-partial-page-ref.patch @@ -0,0 +1,198 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225708322 0 +# Node ID 
540483d2a98f3fbabf06961cc0cc52e3c59c245b +# Parent 303b1014f91e5fa0783a5d7095626a47e82db9d0 +x86: simplify page reference handling for partially (in-)validated pages + +Simplify general page reference management for preempted (partially +[in-]validated) pages: Reserve one reference that can be acquired +without the risk of overflowing the reference count, thus allowing to +have a simplified get_page() equivalent that cannot fail (but must be +used with care). + +Doing this conversion pointed out a latent issue in the changes done +previously in this area: The extra reference must be acquired before +the 'normal' reference gets dropped, so the patch fixes this at once +in both the alloc_page_type() and free_page_type() paths (it's really +only the latter that failed to work with the change described above). + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1856,7 +1856,8 @@ int get_page(struct page_info *page, str + nx = x + 1; + d = nd; + if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ +- unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ ++ /* Keep one spare reference to be acquired by get_page_light(). */ ++ unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */ + unlikely(d != _domain) ) /* Wrong owner? */ + { + if ( !_shadow_mode_refcounts(domain) && !domain->is_dying ) +@@ -1878,6 +1879,28 @@ int get_page(struct page_info *page, str + return 1; + } + ++/* ++ * Special version of get_page() to be used exclusively when ++ * - a page is known to already have a non-zero reference count ++ * - the page does not need its owner to be checked ++ * - it will not be called more than once without dropping the thus ++ * acquired reference again. ++ * Due to get_page() reserving one reference, this call cannot fail. ++ */ ++static void get_page_light(struct page_info *page) ++{ ++ u32 x, nx, y = page->count_info; ++ ++ do { ++ x = y; ++ nx = x + 1; ++ BUG_ON(!(x & PGC_count_mask)); /* Not allocated? 
*/ ++ BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */ ++ y = cmpxchg(&page->count_info, x, nx); ++ } ++ while ( unlikely(y != x) ); ++} ++ + + static int alloc_page_type(struct page_info *page, unsigned long type, + int preemptible) +@@ -1885,10 +1908,6 @@ static int alloc_page_type(struct page_i + struct domain *owner = page_get_owner(page); + int rc; + +- /* Obtain an extra reference to retain if we set PGT_partial. */ +- if ( preemptible && !get_page(page, owner) ) +- return -EINVAL; +- + /* A page table is dirtied when its type count becomes non-zero. */ + if ( likely(owner != NULL) ) + paging_mark_dirty(owner, page_to_mfn(page)); +@@ -1922,14 +1941,10 @@ static int alloc_page_type(struct page_i + wmb(); + if ( rc == -EAGAIN ) + { ++ get_page_light(page); + page->u.inuse.type_info |= PGT_partial; +- return -EAGAIN; + } +- +- if ( preemptible ) +- put_page(page); +- +- if ( rc == -EINTR ) ++ else if ( rc == -EINTR ) + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); +@@ -2044,8 +2059,8 @@ static int __put_final_page_type( + } + else if ( rc == -EINTR ) + { +- ASSERT(!(page->u.inuse.type_info & +- (PGT_count_mask|PGT_validated|PGT_partial))); ++ ASSERT((page->u.inuse.type_info & ++ (PGT_count_mask|PGT_validated|PGT_partial)) == 1); + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) + page->tlbflush_timestamp = tlbflush_current_time(); +@@ -2056,14 +2071,10 @@ static int __put_final_page_type( + { + BUG_ON(rc != -EAGAIN); + wmb(); ++ get_page_light(page); + page->u.inuse.type_info |= PGT_partial; +- /* Must skip put_page() below. */ +- preemptible = 0; + } + +- if ( preemptible ) +- put_page(page); +- + return rc; + } + +@@ -2072,10 +2083,7 @@ static int __put_page_type(struct page_i + int preemptible) + { + unsigned long nx, x, y = page->u.inuse.type_info; +- +- /* Obtain an extra reference to retain if we set PGT_partial. 
*/ +- if ( preemptible && !get_page(page, page_get_owner(page)) ) +- return -EINVAL; ++ int rc = 0; + + for ( ; ; ) + { +@@ -2098,10 +2106,11 @@ static int __put_page_type(struct page_i + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, + x, nx)) != x) ) + continue; ++ /* We cleared the 'valid bit' so we do the clean up. */ ++ rc = __put_final_page_type(page, x, preemptible); + if ( x & PGT_partial ) + put_page(page); +- /* We cleared the 'valid bit' so we do the clean up. */ +- return __put_final_page_type(page, x, preemptible); ++ break; + } + + /* +@@ -2120,17 +2129,10 @@ static int __put_page_type(struct page_i + break; + + if ( preemptible && hypercall_preempt_check() ) +- { +- if ( preemptible ) +- put_page(page); + return -EINTR; +- } + } + +- if ( preemptible ) +- put_page(page); +- +- return 0; ++ return rc; + } + + +@@ -2138,6 +2140,7 @@ static int __get_page_type(struct page_i + int preemptible) + { + unsigned long nx, x, y = page->u.inuse.type_info; ++ int rc = 0; + + ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); + +@@ -2233,11 +2236,7 @@ static int __get_page_type(struct page_i + } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) +- { +- if ( (x & PGT_partial) && !(nx & PGT_partial) ) +- put_page(page); + break; +- } + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; +@@ -2264,10 +2263,13 @@ static int __get_page_type(struct page_i + page->nr_validated_ptes = 0; + page->partial_pte = 0; + } +- return alloc_page_type(page, type, preemptible); ++ rc = alloc_page_type(page, type, preemptible); + } + +- return 0; ++ if ( (x & PGT_partial) && !(nx & PGT_partial) ) ++ put_page(page); ++ ++ return rc; + } + + void put_page_type(struct page_info *page) diff --git a/18771-reduce-GDT-switching.patch b/18771-reduce-GDT-switching.patch new file mode 100644 index 0000000..fb81d11 --- /dev/null +++ b/18771-reduce-GDT-switching.patch @@ -0,0 +1,347 @@ +# HG changeset patch +# User Keir Fraser +# Date 1226491295 0 +# 
Node ID 8e18dd41c6c7bb0980b29393b275c564cfb96437 +# Parent 2bd99c5faa420612544a9d94e298332e0e72a86a +x86: reduce GDT switching + +Both idle and HVM vCPU-s can easily run on the GDT mapped into general +hypervisor space (rather than that placed in per-vCPU virtual space). + +This makes unnecessary some of the additions c/s 18520 did. + +Signed-off-by: Jan Beulich + +Index: xen-3.3.1-testing/xen/arch/x86/cpu/common.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/cpu/common.c ++++ xen-3.3.1-testing/xen/arch/x86/cpu/common.c +@@ -564,7 +564,10 @@ void __cpuinit cpu_init(void) + { + int cpu = smp_processor_id(); + struct tss_struct *t = &init_tss[cpu]; +- char gdt_load[10]; ++ struct desc_ptr gdt_desc = { ++ .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), ++ .limit = LAST_RESERVED_GDT_BYTE ++ }; + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); +@@ -578,9 +581,7 @@ void __cpuinit cpu_init(void) + /* Install correct page table. */ + write_ptbase(current); + +- *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE; +- *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current); +- asm volatile ( "lgdt %0" : "=m" (gdt_load) ); ++ asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + + /* No nested task. 
*/ + asm volatile ("pushf ; andw $0xbfff,(%"__OP"sp) ; popf" ); +Index: xen-3.3.1-testing/xen/arch/x86/domain.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/domain.c ++++ xen-3.3.1-testing/xen/arch/x86/domain.c +@@ -309,12 +309,7 @@ int vcpu_initialise(struct vcpu *v) + if ( is_idle_domain(d) ) + { + v->arch.schedule_tail = continue_idle_domain; +- if ( v->vcpu_id ) +- v->arch.cr3 = d->vcpu[0]->arch.cr3; +- else if ( !*idle_vcpu ) +- v->arch.cr3 = __pa(idle_pg_table); +- else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) ) +- return -ENOMEM; ++ v->arch.cr3 = __pa(idle_pg_table); + } + + v->arch.guest_context.ctrlreg[4] = +@@ -1171,14 +1166,18 @@ static void paravirt_ctxt_switch_to(stru + } + } + ++static inline int need_full_gdt(struct vcpu *v) ++{ ++ return (!is_hvm_vcpu(v) && !is_idle_vcpu(v)); ++} ++ + static void __context_switch(void) + { + struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); +- unsigned int i, cpu = smp_processor_id(); ++ unsigned int cpu = smp_processor_id(); + struct vcpu *p = per_cpu(curr_vcpu, cpu); + struct vcpu *n = current; + struct desc_struct *gdt; +- struct page_info *page; + struct desc_ptr gdt_desc; + + ASSERT(p != n); +@@ -1207,16 +1206,19 @@ static void __context_switch(void) + + gdt = !is_pv_32on64_vcpu(n) ? 
per_cpu(gdt_table, cpu) : + per_cpu(compat_gdt_table, cpu); +- page = virt_to_page(gdt); +- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) ++ if ( need_full_gdt(n) ) + { +- l1e_write(n->domain->arch.mm_perdomain_pt + +- (n->vcpu_id << GDT_LDT_VCPU_SHIFT) + +- FIRST_RESERVED_GDT_PAGE + i, +- l1e_from_page(page + i, __PAGE_HYPERVISOR)); ++ struct page_info *page = virt_to_page(gdt); ++ unsigned int i; ++ for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ ) ++ l1e_write(n->domain->arch.mm_perdomain_pt + ++ (n->vcpu_id << GDT_LDT_VCPU_SHIFT) + ++ FIRST_RESERVED_GDT_PAGE + i, ++ l1e_from_page(page + i, __PAGE_HYPERVISOR)); + } + +- if ( p->vcpu_id != n->vcpu_id ) ++ if ( need_full_gdt(p) && ++ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) ) + { + gdt_desc.limit = LAST_RESERVED_GDT_BYTE; + gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY); +@@ -1225,8 +1227,10 @@ static void __context_switch(void) + + write_ptbase(n); + +- if ( p->vcpu_id != n->vcpu_id ) ++ if ( need_full_gdt(n) && ++ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) ) + { ++ gdt_desc.limit = LAST_RESERVED_GDT_BYTE; + gdt_desc.base = GDT_VIRT_START(n); + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + } +Index: xen-3.3.1-testing/xen/arch/x86/domain_build.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/domain_build.c ++++ xen-3.3.1-testing/xen/arch/x86/domain_build.c +@@ -707,6 +707,7 @@ int __init construct_dom0( + + /* Install the new page tables. */ + local_irq_disable(); ++ /* We run on dom0's page tables for the final part of the build process. */ + write_ptbase(v); + + /* Copy the OS image and free temporary buffer. 
*/ +@@ -719,11 +720,11 @@ int __init construct_dom0( + (parms.virt_hypercall >= v_end) ) + { + write_ptbase(current); +- local_irq_enable(); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + return -1; + } +- hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall); ++ hypercall_page_initialise( ++ d, (void *)(unsigned long)parms.virt_hypercall); + } + + /* Copy the initial ramdisk. */ +@@ -804,7 +805,7 @@ int __init construct_dom0( + xlat_start_info(si, XLAT_start_info_console_dom0); + #endif + +- /* Reinstate the caller's page tables. */ ++ /* Return to idle domain's page tables. */ + write_ptbase(current); + local_irq_enable(); + +Index: xen-3.3.1-testing/xen/arch/x86/hvm/vmx/vmcs.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/hvm/vmx/vmcs.c ++++ xen-3.3.1-testing/xen/arch/x86/hvm/vmx/vmcs.c +@@ -444,6 +444,8 @@ static void vmx_set_host_env(struct vcpu + { + unsigned int cpu = smp_processor_id(); + ++ __vmwrite(HOST_GDTR_BASE, ++ (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)); + __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]); + + __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3); +@@ -541,9 +543,6 @@ static int construct_vmcs(struct vcpu *v + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); + +- /* Host GDTR base. */ +- __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v)); +- + /* Host data selectors. 
*/ + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); +Index: xen-3.3.1-testing/xen/arch/x86/setup.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/setup.c ++++ xen-3.3.1-testing/xen/arch/x86/setup.c +@@ -230,7 +230,6 @@ static void __init percpu_init_areas(voi + static void __init init_idle_domain(void) + { + struct domain *idle_domain; +- unsigned int i; + + /* Domain creation requires that scheduler structures are initialised. */ + scheduler_init(); +@@ -243,12 +242,6 @@ static void __init init_idle_domain(void + idle_vcpu[0] = this_cpu(curr_vcpu) = current; + + setup_idle_pagetable(); +- +- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) +- idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] = +- l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i, +- __PAGE_HYPERVISOR); +- + } + + static void __init srat_detect_node(int cpu) +@@ -456,6 +449,7 @@ void __init __start_xen(unsigned long mb + parse_video_info(); + + set_current((struct vcpu *)0xfffff000); /* debug sanity */ ++ idle_vcpu[0] = current; + set_processor_id(0); /* needed early, for smp_processor_id() */ + if ( cpu_has_efer ) + rdmsrl(MSR_EFER, this_cpu(efer)); +Index: xen-3.3.1-testing/xen/arch/x86/smpboot.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/smpboot.c ++++ xen-3.3.1-testing/xen/arch/x86/smpboot.c +@@ -828,7 +828,7 @@ static int __devinit do_boot_cpu(int api + */ + { + unsigned long boot_error; +- unsigned int i; ++ unsigned int order; + int timeout; + unsigned long start_eip; + unsigned short nmi_high = 0, nmi_low = 0; +@@ -864,21 +864,21 @@ static int __devinit do_boot_cpu(int api + + gdt = per_cpu(gdt_table, cpu); + if (gdt == boot_cpu_gdt_table) { +- i = get_order_from_pages(NR_RESERVED_GDT_PAGES); ++ order = get_order_from_pages(NR_RESERVED_GDT_PAGES); + #ifdef __x86_64__ + #ifdef CONFIG_COMPAT +- page = 
alloc_domheap_pages(NULL, i, ++ page = alloc_domheap_pages(NULL, order, + MEMF_node(cpu_to_node(cpu))); + per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page); + memcpy(gdt, boot_cpu_compat_gdt_table, + NR_RESERVED_GDT_PAGES * PAGE_SIZE); + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; + #endif +- page = alloc_domheap_pages(NULL, i, ++ page = alloc_domheap_pages(NULL, order, + MEMF_node(cpu_to_node(cpu))); + per_cpu(gdt_table, cpu) = gdt = page_to_virt(page); + #else +- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i); ++ per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order); + #endif + memcpy(gdt, boot_cpu_gdt_table, + NR_RESERVED_GDT_PAGES * PAGE_SIZE); +@@ -886,13 +886,6 @@ static int __devinit do_boot_cpu(int api + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; + } + +- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) +- v->domain->arch.mm_perdomain_pt +- [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) + +- FIRST_RESERVED_GDT_PAGE + i] +- = l1e_from_page(virt_to_page(gdt) + i, +- __PAGE_HYPERVISOR); +- + #ifdef __i386__ + if (!per_cpu(doublefault_tss, cpu)) { + per_cpu(doublefault_tss, cpu) = alloc_xenheap_page(); +Index: xen-3.3.1-testing/xen/arch/x86/x86_32/mm.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/x86_32/mm.c ++++ xen-3.3.1-testing/xen/arch/x86/x86_32/mm.c +@@ -132,30 +132,6 @@ void __init setup_idle_pagetable(void) + __PAGE_HYPERVISOR)); + } + +-unsigned long clone_idle_pagetable(struct vcpu *v) +-{ +- unsigned int i; +- struct domain *d = v->domain; +- l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0]; +- l2_pgentry_t *l2_table = alloc_xenheap_page(); +- +- if ( !l2_table ) +- return 0; +- +- memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table)); +- l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] = +- l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT); +- +- copy_page(l2_table, idle_pg_table_l2 + +- 
l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES); +- for ( i = 0; i < PDPT_L2_ENTRIES; ++i ) +- l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] = +- l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, +- __PAGE_HYPERVISOR); +- +- return __pa(l3_table); +-} +- + void __init zap_low_mappings(l2_pgentry_t *dom0_l2) + { + int i; +Index: xen-3.3.1-testing/xen/arch/x86/x86_64/mm.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/x86_64/mm.c ++++ xen-3.3.1-testing/xen/arch/x86/x86_64/mm.c +@@ -21,7 +21,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -207,24 +206,6 @@ void __init setup_idle_pagetable(void) + __PAGE_HYPERVISOR)); + } + +-unsigned long clone_idle_pagetable(struct vcpu *v) +-{ +- struct domain *d = v->domain; +- struct page_info *page = alloc_domheap_page(NULL, +- MEMF_node(vcpu_to_node(v))); +- l4_pgentry_t *l4_table = page_to_virt(page); +- +- if ( !page ) +- return 0; +- +- copy_page(l4_table, idle_pg_table); +- l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] = +- l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3), +- __PAGE_HYPERVISOR); +- +- return __pa(l4_table); +-} +- + void __init zap_low_mappings(void) + { + BUG_ON(num_online_cpus() != 1); +Index: xen-3.3.1-testing/xen/include/asm-x86/page.h +=================================================================== +--- xen-3.3.1-testing.orig/xen/include/asm-x86/page.h ++++ xen-3.3.1-testing/xen/include/asm-x86/page.h +@@ -278,7 +278,6 @@ extern unsigned int m2p_compat_vstart; + #endif + void paging_init(void); + void setup_idle_pagetable(void); +-unsigned long clone_idle_pagetable(struct vcpu *); + #endif /* !defined(__ASSEMBLY__) */ + + #define _PAGE_PRESENT 0x001U diff --git a/18778-msi-irq-fix.patch b/18778-msi-irq-fix.patch new file mode 100644 index 0000000..996397c --- /dev/null +++ b/18778-msi-irq-fix.patch @@ -0,0 +1,92 @@ +# HG changeset patch +# User Keir Fraser +# Date 
1226593868 0 +# Node ID a0910b1b5ec0c938f1c46437df6c28cbeff52c68 +# Parent d44ad6db638c1308e5ee4a47509769c3cccbe1e8 +x86: don't disable MSI in order to mask an IRQ + +... as that's not really correct, and there are devices which can't +even cope with that. Instead, check whether an MSI IRQ can be masked, +and if it can't, treat it just like a level triggered IO-APIC IRQ. + +There's one other bug fix in here, correcting an off-by-one error on +the entry_nr range check in __pci_enable_msix(). + +Signed-off-by: Jan Beulich + +Index: xen-3.3.1-testing/xen/arch/x86/irq.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/irq.c ++++ xen-3.3.1-testing/xen/arch/x86/irq.c +@@ -463,14 +463,19 @@ int pirq_acktype(struct domain *d, int i + /* + * Edge-triggered IO-APIC and LAPIC interrupts need no final + * acknowledgement: we ACK early during interrupt processing. +- * MSIs are treated as edge-triggered interrupts. + */ + if ( !strcmp(desc->handler->typename, "IO-APIC-edge") || +- !strcmp(desc->handler->typename, "local-APIC-edge") || +- !strcmp(desc->handler->typename, "PCI-MSI") ) ++ !strcmp(desc->handler->typename, "local-APIC-edge") ) + return ACKTYPE_NONE; + + /* ++ * MSIs are treated as edge-triggered interrupts, except ++ * when there is no proper way to mask them. ++ */ ++ if ( desc->handler == &pci_msi_type ) ++ return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI; ++ ++ /* + * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU + * on which they were received. This is because we tickle the LAPIC to EOI. 
+ */ +Index: xen-3.3.1-testing/xen/arch/x86/msi.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/msi.c ++++ xen-3.3.1-testing/xen/arch/x86/msi.c +@@ -303,6 +303,13 @@ static void msix_flush_writes(unsigned i + } + } + ++int msi_maskable_irq(const struct msi_desc *entry) ++{ ++ BUG_ON(!entry); ++ return entry->msi_attrib.type != PCI_CAP_ID_MSI ++ || entry->msi_attrib.maskbit; ++} ++ + static void msi_set_mask_bit(unsigned int irq, int flag) + { + struct msi_desc *entry = irq_desc[irq].msi_desc; +@@ -323,8 +330,6 @@ static void msi_set_mask_bit(unsigned in + mask_bits &= ~(1); + mask_bits |= flag; + pci_conf_write32(bus, slot, func, pos, mask_bits); +- } else { +- msi_set_enable(entry->dev, !flag); + } + break; + case PCI_CAP_ID_MSIX: +@@ -654,7 +659,7 @@ static int __pci_enable_msix(struct msi_ + pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX); + control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos)); + nr_entries = multi_msix_capable(control); +- if (msi->entry_nr > nr_entries) ++ if (msi->entry_nr >= nr_entries) + { + spin_unlock(&pdev->lock); + return -EINVAL; +Index: xen-3.3.1-testing/xen/include/asm-x86/msi.h +=================================================================== +--- xen-3.3.1-testing.orig/xen/include/asm-x86/msi.h ++++ xen-3.3.1-testing/xen/include/asm-x86/msi.h +@@ -97,6 +97,8 @@ struct msi_desc { + int remap_index; /* index in interrupt remapping table */ + }; + ++int msi_maskable_irq(const struct msi_desc *); ++ + /* + * Assume the maximum number of hot plug slots supported by the system is about + * ten. 
The worstcase is that each of these slots is hot-added with a device, diff --git a/blktap.patch b/blktap.patch index fcd3257..224cea7 100644 --- a/blktap.patch +++ b/blktap.patch @@ -45,7 +45,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c /* read the name of the device */ if (pasprintf(&buf, "%s/dev", bpath) == -1) continue; -@@ -429,6 +437,7 @@ void xenstore_parse_domain_config(int hv +@@ -432,6 +440,7 @@ void xenstore_parse_domain_config(int hv free(type); free(params); free(dev); diff --git a/cdrom-removable.patch b/cdrom-removable.patch index f628434..8d48e15 100644 --- a/cdrom-removable.patch +++ b/cdrom-removable.patch @@ -401,24 +401,27 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c =================================================================== --- xen-3.3.1-testing.orig/tools/ioemu-remote/xenstore.c +++ xen-3.3.1-testing/tools/ioemu-remote/xenstore.c -@@ -297,6 +297,16 @@ void xenstore_parse_domain_config(int hv - bdrv_set_type_hint(bs, BDRV_TYPE_CDROM); - if (pasprintf(&buf, "%s/params", bpath) != -1) - xs_watch(xsh, buf, dev); -+ /* if pyhsical put a watch on media-present */ +@@ -333,6 +333,19 @@ void xenstore_parse_domain_config(int hv + if (bdrv_open2(bs, params, 0 /* snapshot */, format) < 0) + fprintf(stderr, "qemu: could not open vbd '%s' or hard disk image '%s' (drv '%s' format '%s')\n", buf, params, drv ? drv : "?", format ? 
format->format_name : "0"); + } ++ /* if cdrom pyhsical put a watch on media-present */ ++ if (bdrv_get_type_hint(bs) == BDRV_TYPE_CDROM) { + if (drv && !strcmp(drv, "phy")) { + if (pasprintf(&buf, "%s/media-present", bpath) != -1) { + if (bdrv_is_inserted(bs)) + xs_write(xsh, XBT_NULL, buf, "1", strlen("1")); -+ else ++ else { + xs_write(xsh, XBT_NULL, buf, "0", strlen("0")); ++ } + xs_watch(xsh, buf, "media-present"); + } + } - } ++ } - /* open device now if media present */ -@@ -631,6 +641,50 @@ void xenstore_record_dm_state(char *stat + drives_table[nb_drives].bdrv = bs; + nb_drives++; +@@ -631,6 +644,50 @@ void xenstore_record_dm_state(char *stat xenstore_record_dm("state", state); } @@ -469,7 +472,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c void xenstore_process_event(void *opaque) { char **vec, *offset, *bpath = NULL, *buf = NULL, *drv = NULL, *image = NULL; -@@ -650,6 +704,11 @@ void xenstore_process_event(void *opaque +@@ -650,6 +707,11 @@ void xenstore_process_event(void *opaque goto out; } diff --git a/hv_tools.patch b/hv_tools.patch index c425ef5..55412c4 100644 --- a/hv_tools.patch +++ b/hv_tools.patch @@ -45,7 +45,7 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/image.py =================================================================== --- xen-3.3.1-testing.orig/tools/python/xen/xend/image.py +++ xen-3.3.1-testing/tools/python/xen/xend/image.py -@@ -697,6 +697,7 @@ class HVMImageHandler(ImageHandler): +@@ -703,6 +703,7 @@ class HVMImageHandler(ImageHandler): self.apic = int(vmConfig['platform'].get('apic', 0)) self.acpi = int(vmConfig['platform'].get('acpi', 0)) @@ -53,7 +53,7 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/image.py self.guest_os_type = vmConfig['platform'].get('guest_os_type') -@@ -803,6 +804,7 @@ class HVMImageHandler(ImageHandler): +@@ -809,6 +810,7 @@ class HVMImageHandler(ImageHandler): log.debug("store_evtchn = %d", store_evtchn) log.debug("memsize = %d", mem_mb) log.debug("vcpus = %d", 
self.vm.getVCpuCount()) @@ -61,7 +61,7 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/image.py log.debug("acpi = %d", self.acpi) log.debug("apic = %d", self.apic) -@@ -810,6 +812,7 @@ class HVMImageHandler(ImageHandler): +@@ -816,6 +818,7 @@ class HVMImageHandler(ImageHandler): image = self.loader, memsize = mem_mb, vcpus = self.vm.getVCpuCount(), diff --git a/hv_xen_base.patch b/hv_xen_base.patch index ce34961..fe900de 100644 --- a/hv_xen_base.patch +++ b/hv_xen_base.patch @@ -108,7 +108,7 @@ Index: xen-3.3.1-testing/xen/arch/x86/hvm/hvm.c return hvm_funcs.msr_write_intercept(regs); } -@@ -1961,6 +1977,10 @@ int hvm_do_hypercall(struct cpu_user_reg +@@ -2002,6 +2018,10 @@ int hvm_do_hypercall(struct cpu_user_reg case 0: break; } @@ -119,7 +119,7 @@ Index: xen-3.3.1-testing/xen/arch/x86/hvm/hvm.c if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] ) { -@@ -2462,6 +2482,15 @@ long do_hvm_op(unsigned long op, XEN_GUE +@@ -2503,6 +2523,15 @@ long do_hvm_op(unsigned long op, XEN_GUE rc = -EINVAL; break; diff --git a/ioemu-blktap-zero-size.patch b/ioemu-blktap-zero-size.patch index c440385..ffedb36 100644 --- a/ioemu-blktap-zero-size.patch +++ b/ioemu-blktap-zero-size.patch @@ -31,7 +31,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/hw/xen_blktap.c /*Open file*/ if (s == NULL || open_disk(s, path, msg->drivertype, msg->readonly)) { -@@ -624,7 +632,8 @@ static void handle_blktap_ctrlmsg(void* +@@ -645,7 +653,8 @@ static void handle_blktap_ctrlmsg(void* case CTLMSG_CLOSE: s = get_state(msg->cookie); if (s) unmap_disk(s); diff --git a/ioemu-vnc-resize.patch b/ioemu-vnc-resize.patch new file mode 100644 index 0000000..a2b3144 --- /dev/null +++ b/ioemu-vnc-resize.patch @@ -0,0 +1,63 @@ +Index: xen-3.3.1-testing/tools/ioemu-remote/vnc.c +=================================================================== +--- xen-3.3.1-testing.orig/tools/ioemu-remote/vnc.c ++++ xen-3.3.1-testing/tools/ioemu-remote/vnc.c +@@ -352,6 +352,11 @@ static void vnc_dpy_update(DisplayState 
+ { + VncState *vs = ds->opaque; + ++ x = MIN(x, vs->width); ++ y = MIN(y, vs->height); ++ w = MIN(w, vs->width - x); ++ h = MIN(h, vs->height - y); ++ + set_bits_in_row(vs, vs->dirty_row, x, y, w, h); + } + +@@ -405,9 +413,13 @@ static void vnc_dpy_resize_shared(Displa + size_changed = ds->width != w || ds->height != h; + ds->width = w; + ds->height = h; +- if (vs->csock != -1 && vs->has_resize && size_changed) { ++ if (size_changed) { + vs->width = ds->width; + vs->height = ds->height; ++ VNC_DEBUG("vs->width = %d, vs->height = %d\n", ++ ds->width, ds->height); ++ } ++ if (vs->csock != -1 && vs->has_resize && size_changed) { + if (vs->update_requested) { + vnc_write_u8(vs, 0); /* msg id */ + vnc_write_u8(vs, 0); +@@ -1794,6 +1837,31 @@ static int protocol_client_msg(VncState + } + + set_encodings(vs, (int32_t *)(data + 4), limit); ++ ++ /* ++ * The initialization of a VNC connection can race with xenfb changing ++ * the resolution. This happens when the VNC connection is already ++ * established, but the client has not yet advertised has_resize, so it ++ * won't get notified of the switch. ++ * ++ * Therefore we resend the resolution as soon as the client has sent its ++ * encodings. 
++ */ ++ if (vs->has_resize) { ++ /* Resize the VNC window */ ++ vnc_write_u8(vs, 0); /* msg id */ ++ vnc_write_u8(vs, 0); ++ vnc_write_u16(vs, 1); /* number of rects */ ++ vnc_framebuffer_update(vs, 0, 0, vs->ds->width, vs->ds->height, -223); ++ ++ /* Ensure that the new area is updated */ ++ vnc_write_u8(vs, 0); /* msg id */ ++ vnc_write_u8(vs, 0); ++ vnc_write_u16(vs, 1); /* number of rects */ ++ send_framebuffer_update(vs, 0, 0, vs->ds->width, vs->ds->height); ++ ++ vnc_flush(vs); ++ } + break; + case 3: + if (len == 1) diff --git a/snapshot-ioemu-delete.patch b/snapshot-ioemu-delete.patch index cfa229d..fe5daf9 100644 --- a/snapshot-ioemu-delete.patch +++ b/snapshot-ioemu-delete.patch @@ -2,7 +2,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c =================================================================== --- xen-3.3.1-testing.orig/tools/ioemu-remote/xenstore.c +++ xen-3.3.1-testing/tools/ioemu-remote/xenstore.c -@@ -652,6 +652,19 @@ static void xenstore_process_dm_command_ +@@ -671,6 +671,18 @@ static void xenstore_process_dm_command_ } snapshot_name = xs_read(xsh, XBT_NULL, path, &len); @@ -16,9 +16,69 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c + par = xs_read(xsh, XBT_NULL, path, &len); + if (!par) + goto out; -+ // TODO Error handling -+ do_delvm(par); -+ xenstore_record_dm_state("snapshot-deleted"); ++ if (delete_disk_snapshots(par) == 0) ++ xenstore_record_dm_state("snapshot-deleted"); } else if (!strncmp(command, "continue", len)) { fprintf(logfile, "dm-command: continue after state save\n"); xen_pause_requested = 0; +Index: xen-3.3.1-testing/tools/ioemu-remote/xen-vl-extra.c +=================================================================== +--- xen-3.3.1-testing.orig/tools/ioemu-remote/xen-vl-extra.c ++++ xen-3.3.1-testing/tools/ioemu-remote/xen-vl-extra.c +@@ -16,6 +16,8 @@ static int qemu_savevm_state(QEMUFile *f + static int qemu_loadvm_state(QEMUFile *f); + + static int bdrv_can_snapshot(BlockDriverState *bs); 
++static int bdrv_has_snapshot(BlockDriverState *bs); ++static BlockDriverState *get_bs_snapshots(void); + static int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name); + +@@ -166,6 +168,35 @@ the_end: + return ret; + } + ++int delete_disk_snapshots(const char* name) ++{ ++ BlockDriverState *bs, *bs1; ++ int i, ret; ++ ++ bs = get_bs_snapshots(); ++ if (!bs) { ++ xenstore_record_dm_error("No block device supports snapshots"); ++ return -1; ++ } ++ ++ for(i = 0; i <= nb_drives; i++) { ++ bs1 = drives_table[i].bdrv; ++ if (bdrv_has_snapshot(bs1)) { ++ ret = bdrv_snapshot_delete(bs1, name); ++ if (ret < 0) { ++ if (ret == -ENOTSUP) ++ fprintf(stderr, "Snapshots not supported on device '%s'\n", ++ bdrv_get_device_name(bs1)); ++ else ++ fprintf(stderr, "Error %d while deleting snapshot on " ++ "'%s'\n", ret, bdrv_get_device_name(bs1)); ++ } ++ } ++ } ++ ++ return 0; ++} ++ + struct qemu_alarm_timer; + static int unix_start_timer(struct qemu_alarm_timer *t) { return 0; } + static void unix_stop_timer(struct qemu_alarm_timer *t) { } +Index: xen-3.3.1-testing/tools/ioemu-remote/qemu-xen.h +=================================================================== +--- xen-3.3.1-testing.orig/tools/ioemu-remote/qemu-xen.h ++++ xen-3.3.1-testing/tools/ioemu-remote/qemu-xen.h +@@ -22,6 +22,7 @@ enum { + + /* xen-vl-extra.c */ + int save_disk_snapshots(const char* name); ++int delete_disk_snapshots(const char* name); + + /* helper2.c */ + extern long time_offset; diff --git a/snapshot-ioemu-restore.patch b/snapshot-ioemu-restore.patch index b059fbf..cc7a3c8 100644 --- a/snapshot-ioemu-restore.patch +++ b/snapshot-ioemu-restore.patch @@ -35,7 +35,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/hw/xen_blktap.c static void handle_blktap_iomsg(void* private); struct aiocb_info { -@@ -502,6 +504,10 @@ static void handle_blktap_ctrlmsg(void* +@@ -502,6 +504,10 @@ static void handle_blktap_ctrlmsg(void* char buf[MSG_SIZE]; @@ -46,7 +46,7 @@ Index: 
xen-3.3.1-testing/tools/ioemu-remote/hw/xen_blktap.c length = read(read_fd, buf, MSG_SIZE); if (length > 0 && length >= sizeof(msg_hdr_t)) -@@ -557,7 +563,18 @@ static void handle_blktap_ctrlmsg(void* +@@ -557,7 +563,39 @@ static void handle_blktap_ctrlmsg(void* if (s != NULL) { ret = ((map_new_dev(s, msg_dev->devnum) == msg_dev->devnum ? 0: -1)); @@ -57,10 +57,31 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/hw/xen_blktap.c + fprintf(stderr, "Reading snapshot name for %d\n", msg_dev->be_id); + snapshot = get_snapshot_name(msg_dev->be_id); + if (snapshot) { -+ // TODO Error handling + fprintf(stderr, "Using snapshot %s\n", snapshot); -+ if (bdrv_snapshot_goto(s->bs, snapshot) < 0) ++ ret = bdrv_snapshot_goto(s->bs, snapshot); ++ switch (ret) { ++ case 0: ++ /* Success */ ++ break; ++ case -ENOTSUP: ++ if (s->flags & TD_RDONLY) { ++ fprintf(stderr, "Snapshots not supported for " ++ "image format of a read-only image\n"); ++ } else { ++ fprintf(stderr, "Snapshots not supported " ++ "for this image format"); ++ ret = -1; ++ } ++ break; ++ case -ENOENT: ++ fprintf(stderr, "No such snapshot"); ++ ret = -1; ++ break; ++ default: + fprintf(stderr, "Could not load snapshot"); ++ ret = -1; ++ break; ++ } + } +#endif @@ -79,7 +100,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c static int pasprintf(char **buf, const char *fmt, ...) 
{ va_list ap; -@@ -373,8 +375,17 @@ void xenstore_parse_domain_config(int hv +@@ -363,8 +365,33 @@ void xenstore_parse_domain_config(int hv } } pstrcpy(bs->filename, sizeof(bs->filename), params); @@ -89,16 +110,32 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c + } else { + char* snapshot = get_snapshot_name(atoi(e[i])); + if (snapshot) { -+ // TODO Error handling + fprintf(stderr, "Using snapshot %s\n", snapshot); -+ if (bdrv_snapshot_goto(bs, snapshot) < 0) -+ fprintf(stderr, "Could not load snapshot"); ++ ret = bdrv_snapshot_goto(bs, snapshot); ++ switch (ret) { ++ case 0: ++ /* Success */ ++ break; ++ case -ENOTSUP: ++ /* Don't abort here (could be read-only ISO) */ ++ fprintf(stderr, "Snapshots are not supported for " ++ "this image file format\n"); ++ break; ++ case -ENOENT: ++ fprintf(stderr, "No such snapshot, skipping this " ++ "image file\n"); ++ continue; ++ default: ++ fprintf(stderr, "Could not load snapshot, skipping" ++ " this image file\n"); ++ continue; ++ } + } + } } - - drives_table[nb_drives].bdrv = bs; -@@ -471,6 +482,23 @@ void xenstore_parse_domain_config(int hv + /* if cdrom pyhsical put a watch on media-present */ + if (bdrv_get_type_hint(bs) == BDRV_TYPE_CDROM) { +@@ -474,6 +485,23 @@ void xenstore_parse_domain_config(int hv return; } diff --git a/snapshot-ioemu-save.patch b/snapshot-ioemu-save.patch index c79070c..6b3fbf0 100644 --- a/snapshot-ioemu-save.patch +++ b/snapshot-ioemu-save.patch @@ -117,7 +117,15 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/i386-dm/helper2.c CPUX86State *cpu_x86_init(const char *cpu_model) { CPUX86State *env; -@@ -574,9 +577,24 @@ int main_loop(void) +@@ -547,6 +550,7 @@ int main_loop(void) + int evtchn_fd = xce_handle == -1 ? 
-1 : xc_evtchn_fd(xce_handle); + char *qemu_file; + fd_set fds; ++ int ret; + + main_loop_prepare(); + +@@ -574,11 +578,43 @@ int main_loop(void) main_loop_wait(1); /* For the select() on events */ /* Save the device state */ @@ -129,22 +137,42 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/i386-dm/helper2.c + asprintf(&qemu_file, "/var/lib/xen/qemu-save.%d", domid); + do_savevm(qemu_file); + free(qemu_file); ++ xenstore_record_dm_state("paused"); + break; + + case SUSPEND_SNAPSHOT: -+ // TODO Error reporting + if (snapshot_name != NULL) { -+ save_disk_snapshots(snapshot_name); ++ ret = save_disk_snapshots(snapshot_name); + free(snapshot_name); + snapshot_name = NULL; ++ ++ switch (ret) { ++ case 0: ++ xenstore_record_dm_state("paused"); ++ break; ++ case -ENOTSUP: ++ xenstore_record_dm_error("Snapshots not supported on all" ++ " attached read-write disks"); ++ break; ++ case -ENOENT: ++ xenstore_record_dm_error("A snapshot with the same name" ++ " already exists"); ++ break; ++ default: ++ xenstore_record_dm_error("An error occurred while saving" ++ " the snapshot"); ++ break; ++ } + } else { -+ fprintf(logfile, "No snapshot name given\n"); ++ xenstore_record_dm_error("No snapshot name given"); + } + break; + } - xenstore_record_dm_state("paused"); +- xenstore_record_dm_state("paused"); + /* Wait to be allowed to continue */ + while (xen_pause_requested) { Index: xen-3.3.1-testing/tools/ioemu-remote/qemu-xen.h =================================================================== --- xen-3.3.1-testing.orig/tools/ioemu-remote/qemu-xen.h @@ -165,6 +193,14 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/qemu-xen.h /* helper2.c */ extern long time_offset; void timeoffset_get(void); +@@ -43,6 +52,7 @@ int xenstore_fd(void); + void xenstore_process_event(void *opaque); + void xenstore_record_dm(char *subpath, char *state); + void xenstore_record_dm_state(char *state); ++void xenstore_record_dm_error(char *errmsg); + void xenstore_check_new_media_present(int timeout); + void 
xenstore_write_vncport(int vnc_display); + void xenstore_read_vncpasswd(int domid, char *pwbuf, size_t pwbuflen); Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c =================================================================== --- xen-3.3.1-testing.orig/tools/ioemu-remote/xenstore.c @@ -177,7 +213,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c #include "hw.h" #include "pci.h" -@@ -598,6 +599,7 @@ static void xenstore_process_dm_command_ +@@ -601,6 +602,7 @@ static void xenstore_process_dm_command_ { char *path = NULL, *command = NULL, *par = NULL; unsigned int len; @@ -185,7 +221,7 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c if (pasprintf(&path, "/local/domain/0/device-model/%u/command", domid) == -1) { -@@ -610,7 +612,18 @@ static void xenstore_process_dm_command_ +@@ -613,7 +615,18 @@ static void xenstore_process_dm_command_ if (!strncmp(command, "save", len)) { fprintf(logfile, "dm-command: pause and save state\n"); @@ -205,3 +241,17 @@ Index: xen-3.3.1-testing/tools/ioemu-remote/xenstore.c } else if (!strncmp(command, "continue", len)) { fprintf(logfile, "dm-command: continue after state save\n"); xen_pause_requested = 0; +@@ -677,6 +690,13 @@ void xenstore_record_dm_state(char *stat + xenstore_record_dm("state", state); + } + ++void xenstore_record_dm_error(char *errmsg) ++{ ++ fprintf(logfile, "%s\n", errmsg); ++ xenstore_record_dm("error", errmsg); ++ xenstore_record_dm_state("error"); ++} ++ + void xenstore_process_media_change_event(char **vec) + { + char *media_present = NULL; diff --git a/snapshot-xend.patch b/snapshot-xend.patch index dc96d8b..1e6168b 100644 --- a/snapshot-xend.patch +++ b/snapshot-xend.patch @@ -2,7 +2,20 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/image.py =================================================================== --- xen-3.3.1-testing.orig/tools/python/xen/xend/image.py +++ xen-3.3.1-testing/tools/python/xen/xend/image.py -@@ -476,6 +476,10 @@ class ImageHandler: +@@ -447,6 +447,12 
@@ class ImageHandler: + while state != ret: + state = xstransact.Read("/local/domain/0/device-model/%i/state" + % self.vm.getDomid()) ++ if state == 'error': ++ msg = ("The device model returned an error: %s" ++ % xstransact.Read("/local/domain/0/device-model/%i/error" ++ % self.vm.getDomid())) ++ raise VmError(msg) ++ + time.sleep(0.1) + count += 1 + if count > 100: +@@ -476,6 +482,10 @@ class ImageHandler: # but this can easily lead to very rapid restart loops against # which we currently have no protection @@ -164,8 +177,6 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/XendCheckpoint.py + log.info("Domain %d suspended.", dominfo.getDomid()) + dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3, + domain_name) -+ if name: -+ dominfo.image.snapshotDeviceModel(name) + if hvm: + dominfo.image.saveDeviceModel() + @@ -192,6 +203,8 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/XendCheckpoint.py + else: + dominfo.shutdown('suspend') + dominfo.waitForShutdown() ++ ++ if name: + dominfo.image.snapshotDeviceModel(name) + @@ -315,7 +328,7 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/XendDomain.py xc = xen.lowlevel.xc.xc() xoptions = XendOptions.instance() -@@ -1400,6 +1401,164 @@ class XendDomain: +@@ -1400,6 +1401,181 @@ class XendDomain: raise XendError("can't write guest state file %s: %s" % (dst, ex[1])) @@ -348,6 +361,22 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/XendDomain.py + raise XendError("Domain is not managed by Xend lifecycle " + + "support.") + ++ # Check if all images support snapshots ++ for dev_type, dev_info in dominfo.info.all_devices_sxpr(): ++ mode = sxp.child_value(dev_info, 'mode') ++ if mode == 'r': ++ continue; ++ if dev_type == 'vbd': ++ raise XendError("All writable images need to use the " + ++ "tap:qcow2 protocol for snapshot support") ++ if dev_type == 'tap': ++ # Fetch the protocol name from tap:xyz:filename ++ type = sxp.child_value(dev_info, 'uname') ++ type = type.split(':')[1] ++ if type != 'qcow2': ++ raise 
XendError("All writable images need to use the " + ++ "tap:qcow2 protocol for snapshot support") ++ + snap_path = os.path.join(xoptions.get_xend_domains_path(), + dominfo.get_uuid(), "snapshots") + mkdir.parents(snap_path, stat.S_IRWXU) @@ -363,6 +392,7 @@ Index: xen-3.3.1-testing/tools/python/xen/xend/XendDomain.py + True, name=name, diskonly=diskonly) + except Exception, e: + os.close(fd) ++ os.unlink(snap_file) + raise e + os.close(fd) + except OSError, ex: diff --git a/udev-rules.patch b/udev-rules.patch new file mode 100644 index 0000000..384d001 --- /dev/null +++ b/udev-rules.patch @@ -0,0 +1,9 @@ +Index: xen-3.3.1-testing/tools/examples/xen-backend.rules +=================================================================== +--- xen-3.3.1-testing.orig/tools/examples/xen-backend.rules ++++ xen-3.3.1-testing/tools/examples/xen-backend.rules +@@ -7,3 +7,4 @@ SUBSYSTEM=="xen-backend", KERNEL=="vscsi + SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/etc/xen/scripts/xen-hotplug-cleanup" + KERNEL=="evtchn", NAME="xen/%k" + KERNEL=="blktap[0-9]*", NAME="xen/%k" ++KERNELS=="xen", KERNEL=="xvd*", SUBSYSTEM=="block", OPTIONS+="last_rule" diff --git a/x86-dom-cleanup.patch b/x86-dom-cleanup.patch deleted file mode 100644 index 78b2c6f..0000000 --- a/x86-dom-cleanup.patch +++ /dev/null @@ -1,157 +0,0 @@ -Equivalent of -unstable c/s 18720, 18731, and 18735. 
- ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -1687,6 +1687,8 @@ static int relinquish_memory( - { - if ( free_page_type(page, x, 0) != 0 ) - BUG(); -+ if ( x & PGT_partial ) -+ page->u.inuse.type_info--; - break; - } - } ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1343,7 +1343,7 @@ static void free_l1_table(struct page_in - - static int free_l2_table(struct page_info *page, int preemptible) - { --#ifdef CONFIG_COMPAT -+#if defined(CONFIG_COMPAT) || defined(DOMAIN_DESTRUCT_AVOID_RECURSION) - struct domain *d = page_get_owner(page); - #endif - unsigned long pfn = page_to_mfn(page); -@@ -1351,6 +1351,11 @@ static int free_l2_table(struct page_inf - unsigned int i = page->nr_validated_ptes - 1; - int err = 0; - -+#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION -+ if ( d->arch.relmem == RELMEM_l3 ) -+ return 0; -+#endif -+ - pl2e = map_domain_page(pfn); - - ASSERT(page->nr_validated_ptes); -@@ -1381,7 +1386,7 @@ static int free_l3_table(struct page_inf - int rc = 0; - - #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION -- if ( d->arch.relmem == RELMEM_l3 ) -+ if ( d->arch.relmem == RELMEM_l4 ) - return 0; - #endif - -@@ -1424,11 +1429,6 @@ static int free_l4_table(struct page_inf - unsigned int i = page->nr_validated_ptes - !page->partial_pte; - int rc = 0; - --#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION -- if ( d->arch.relmem == RELMEM_l4 ) -- return 0; --#endif -- - do { - if ( is_guest_l4_slot(d, i) ) - rc = put_page_from_l4e(pl4e[i], pfn, preemptible); -@@ -1940,7 +1940,6 @@ int free_page_type(struct page_info *pag - { - struct domain *owner = page_get_owner(page); - unsigned long gmfn; -- int rc; - - if ( likely(owner != NULL) ) - { -@@ -1973,34 +1972,39 @@ int free_page_type(struct page_info *pag - page->nr_validated_ptes = 1U << PAGETABLE_ORDER; - page->partial_pte = 0; - } -+ - switch ( type & PGT_type_mask ) - { - case PGT_l1_page_table: - free_l1_table(page); -- rc = 0; -- break; -+ return 0; - case PGT_l2_page_table: -- rc = free_l2_table(page, 
preemptible); -- break; -+ return free_l2_table(page, preemptible); - case PGT_l3_page_table: - #if CONFIG_PAGING_LEVELS == 3 - if ( !(type & PGT_partial) ) - page->nr_validated_ptes = L3_PAGETABLE_ENTRIES; - #endif -- rc = free_l3_table(page, preemptible); -- break; -+ return free_l3_table(page, preemptible); - case PGT_l4_page_table: -- rc = free_l4_table(page, preemptible); -- break; -- default: -- MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); -- rc = -EINVAL; -- BUG(); -+ return free_l4_table(page, preemptible); - } - -+ MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); -+ BUG(); -+ return -EINVAL; -+} -+ -+ -+static int __put_page_type_final(struct page_info *page, unsigned long type, -+ int preemptible) -+{ -+ int rc = free_page_type(page, type, preemptible); -+ - /* No need for atomic update of type_info here: noone else updates it. */ -- if ( rc == 0 ) -+ switch ( rc ) - { -+ case 0: - /* - * Record TLB information for flush later. We do not stamp page tables - * when running in shadow mode: -@@ -2013,9 +2017,8 @@ int free_page_type(struct page_info *pag - page->tlbflush_timestamp = tlbflush_current_time(); - wmb(); - page->u.inuse.type_info--; -- } -- else if ( rc == -EINTR ) -- { -+ break; -+ case -EINTR: - ASSERT(!(page->u.inuse.type_info & - (PGT_count_mask|PGT_validated|PGT_partial))); - if ( !(shadow_mode_enabled(page_get_owner(page)) && -@@ -2023,12 +2026,13 @@ int free_page_type(struct page_info *pag - page->tlbflush_timestamp = tlbflush_current_time(); - wmb(); - page->u.inuse.type_info |= PGT_validated; -- } -- else -- { -- BUG_ON(rc != -EAGAIN); -+ break; -+ case -EAGAIN: - wmb(); - page->u.inuse.type_info |= PGT_partial; -+ break; -+ default: -+ BUG(); - } - - return rc; -@@ -2062,7 +2066,7 @@ static int __put_page_type(struct page_i - x, nx)) != x) ) - continue; - /* We cleared the 'valid bit' so we do the clean up. 
*/ -- return free_page_type(page, x, preemptible); -+ return __put_page_type_final(page, x, preemptible); - } - - /* diff --git a/x86-show-page-walk-early.patch b/x86-show-page-walk-early.patch index d6f218f..746ca6f 100644 --- a/x86-show-page-walk-early.patch +++ b/x86-show-page-walk-early.patch @@ -69,7 +69,7 @@ Index: xen-3.3.1-testing/xen/arch/x86/x86_64/mm.c =================================================================== --- xen-3.3.1-testing.orig/xen/arch/x86/x86_64/mm.c +++ xen-3.3.1-testing/xen/arch/x86/x86_64/mm.c -@@ -33,6 +33,7 @@ +@@ -32,6 +32,7 @@ #include #include @@ -77,7 +77,7 @@ Index: xen-3.3.1-testing/xen/arch/x86/x86_64/mm.c #ifdef CONFIG_COMPAT unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START; #endif -@@ -152,6 +153,8 @@ void __init paging_init(void) +@@ -151,6 +152,8 @@ void __init paging_init(void) l2_ro_mpt++; } @@ -134,7 +134,7 @@ Index: xen-3.3.1-testing/xen/include/asm-x86/mm.h =================================================================== --- xen-3.3.1-testing.orig/xen/include/asm-x86/mm.h +++ xen-3.3.1-testing/xen/include/asm-x86/mm.h -@@ -307,6 +307,7 @@ TYPE_SAFE(unsigned long,mfn); +@@ -331,6 +331,7 @@ TYPE_SAFE(unsigned long,mfn); #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) #define INVALID_M2P_ENTRY (~0UL) #define VALID_M2P(_e) (!((_e) & (1UL<<(BITS_PER_LONG-1)))) diff --git a/xen-3.3.1-testing-src.tar.bz2 b/xen-3.3.1-testing-src.tar.bz2 index 97dea05..bba000e 100644 --- a/xen-3.3.1-testing-src.tar.bz2 +++ b/xen-3.3.1-testing-src.tar.bz2 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ce2034661e3f994e29c466829910cfd89dba75ba71c4235d66a7940cf3956f5 -size 22695411 +oid sha256:e32c923438b84304bb99311df54aa02eb057b9dbcce2770f3b0310975b3ab066 +size 22694395 diff --git a/xen-lowmem-emergency-pool.diff b/xen-lowmem-emergency-pool.diff index 0d72162..576c7be 100644 --- a/xen-lowmem-emergency-pool.diff +++ b/xen-lowmem-emergency-pool.diff @@ -31,7 +31,7 @@ Index: 
xen-3.3.1-testing/xen/arch/x86/x86_32/mm.c + lowmem_emergency_pool_pages = 4000; } - unsigned long clone_idle_pagetable(struct vcpu *v) + void __init zap_low_mappings(l2_pgentry_t *dom0_l2) Index: xen-3.3.1-testing/xen/common/page_alloc.c =================================================================== --- xen-3.3.1-testing.orig/xen/common/page_alloc.c diff --git a/xen-x86-emulate-movnti.patch b/xen-x86-emulate-movnti.patch new file mode 100644 index 0000000..9a3341a --- /dev/null +++ b/xen-x86-emulate-movnti.patch @@ -0,0 +1,30 @@ +Index: xen-3.3.1-testing/xen/arch/x86/x86_emulate/x86_emulate.c +=================================================================== +--- xen-3.3.1-testing.orig/xen/arch/x86/x86_emulate/x86_emulate.c ++++ xen-3.3.1-testing/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -3657,6 +3657,12 @@ x86_emulate( + case 8: *src.reg = dst.val; break; + } + goto add; ++ ++ case 0xc3: /* movnti */ ++ /* Ignore the non-temporal hint for now. */ ++ generate_exception_if(dst.bytes <= 2, EXC_UD, -1); ++ dst.val = src.val; ++ break; + } + goto writeback; + +@@ -3923,12 +3929,6 @@ x86_emulate( + src.val = x86_seg_gs; + goto pop_seg; + +- case 0xc3: /* movnti */ +- /* Ignore the non-temporal hint for now. 
*/ +- generate_exception_if(dst.bytes <= 2, EXC_UD, -1); +- dst.val = src.val; +- break; +- + case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ { + unsigned long old[2], exp[2], new[2]; + unsigned int i; diff --git a/xen.changes b/xen.changes index dc22f06..d8a2d0d 100644 --- a/xen.changes +++ b/xen.changes @@ -1,3 +1,51 @@ +------------------------------------------------------------------- +Thu Nov 20 19:57:19 CET 2008 - kwolf@suse.de + +- bnc#444731 - Fix data corruption bug (caused by broken x86 + emulation for movnti instruction) + + xen-x86-emulate-movnti.patch + +------------------------------------------------------------------- +Wed Nov 19 20:14:54 CET 2008 - kwolf@suse.de + +- Report device model errors during the creation of snapshots + to xend instead of failing silently + +------------------------------------------------------------------- +Wed Nov 19 16:27:01 CET 2008 - kwolf@suse.de + +- bnc#445659 - ioemu: Workaround for VNC client initialization + race with xenfb changing the resolution (caused VNC connection + to be closed, vm-install recognized this as failed installation) + + ioemu-vnc-resize.patch + +------------------------------------------------------------------- +Tue Nov 18 08:11:34 MST 2008 - carnold@novell.com + +- bnc#444203 - With EPT mode4, HVM S3 causes Xen HV crash. 
+ 18783-hvm-vcpu-reset-state-fix.patch + +------------------------------------------------------------------- +Mon Nov 17 09:14:12 MST 2008 - carnold@novell.com + +- bnc#444731 - Blackscreen instead of second stage during + installation + 18766-realmode-stack-size-fix.patch + +------------------------------------------------------------------- +Thu Nov 13 09:22:54 MST 2008 - carnold@novell.com + +- bnc#429739 - Network failure with bnx2 when booted to XEN + 18778-msi-irq-fix.patch + +------------------------------------------------------------------- +Wed Nov 12 19:07:11 CET 2008 - kwolf@suse.de + +- bnc#444197 - Add udev rule to fix domUloader race with + automounter (udev-rules.patch) + ------------------------------------------------------------------- Sun Nov 9 23:54:34 CET 2008 - ro@suse.de diff --git a/xen.spec b/xen.spec index dd6dc51..e157ae5 100644 --- a/xen.spec +++ b/xen.spec @@ -1,5 +1,5 @@ # -# spec file for package xen (Version 3.3.1_18486_01) +# spec file for package xen (Version 3.3.1_18494_02) # # Copyright (c) 2008 SUSE LINUX Products GmbH, Nuernberg, Germany. 
# @@ -22,7 +22,7 @@ Name: xen ExclusiveArch: %ix86 x86_64 %define xvers 3.3 %define xvermaj 3 -%define changeset 18486 +%define changeset 18494 %define xen_build_dir xen-3.3.1-testing %define with_kmp 0 BuildRequires: LibVNCServer-devel SDL-devel automake bin86 curl-devel dev86 graphviz latex2html libjpeg-devel libxml2-devel ncurses-devel openssl openssl-devel pciutils-devel python-devel transfig @@ -37,8 +37,8 @@ BuildRequires: glibc-32bit glibc-devel-32bit %if %{?with_kmp}0 BuildRequires: kernel-source kernel-syms module-init-tools xorg-x11 %endif -Version: 3.3.1_18486_01 -Release: 2 +Version: 3.3.1_18494_02 +Release: 1 License: GPL v2 only Group: System/Kernel AutoReqProv: on @@ -99,7 +99,20 @@ Patch29: 18637-vmx-set-dr7.patch Patch30: 18654-xend-vcpus.patch Patch31: 18656-vtd-alloc-checks.patch Patch32: 18661-recursive-spinlocks.patch -Patch33: 18745-xend-ioport-irq.patch +Patch33: 18720-x86-dom-cleanup.patch +Patch34: 18722-x86-fixmap-reserved.patch +Patch35: 18723-unmap-dom-page-const.patch +Patch36: 18724-i386-highmem-assist.patch +Patch37: 18731-x86-dom-cleanup.patch +Patch38: 18735-x86-dom-cleanup.patch +Patch39: 18741-x86-dom-cleanup-no-hack.patch +Patch40: 18742-x86-partial-page-ref.patch +Patch41: 18745-xend-ioport-irq.patch +Patch42: 18747-x86-partial-page-ref.patch +Patch43: 18771-reduce-GDT-switching.patch +Patch44: 18778-msi-irq-fix.patch +# Will be fixed in 3.3-testing soon +Patch90: xen-x86-emulate-movnti.patch # Our patches Patch100: xen-config.diff Patch101: xend-config.diff @@ -147,6 +160,8 @@ Patch159: tools-gdbserver-build.diff Patch160: network-route.patch # Needs to go upstream sometime, when python 2.6 is widespread Patch161: python2.6-fixes.patch +Patch162: udev-rules.patch +Patch163: ioemu-vnc-resize.patch # Patches for snapshot support Patch170: qemu-img-snapshot.patch Patch171: ioemu-blktap-fix-open.patch @@ -176,12 +191,8 @@ Patch350: pv-driver-build.patch Patch351: xen-ioemu-hvm-pv-support.diff Patch352: pvdrv_emulation_control.patch 
Patch353: blktap-pv-cdrom.patch -Patch354: i386-highmem-assist.patch -Patch355: x86-cpufreq-report.patch -Patch356: x86-dom-cleanup.patch -Patch357: x86-dom-cleanup-no-hack.patch -Patch358: x86-partial-page-ref.patch -Patch359: dom-print.patch +Patch354: x86-cpufreq-report.patch +Patch355: dom-print.patch # novell_shim patches Patch400: hv_tools.patch Patch401: hv_xen_base.patch @@ -251,7 +262,7 @@ Authors: License: GPL v2 or later Summary: Xen Virtualization: Libraries Group: System/Kernel -Requires: xen = %{version} +#Requires: xen = %{version} AutoReqProv: on %description libs @@ -551,6 +562,18 @@ Authors: %patch31 -p1 %patch32 -p1 %patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch90 -p1 %patch100 -p1 %patch101 -p1 %patch102 -p1 @@ -596,6 +619,8 @@ Authors: %patch159 -p1 %patch160 -p1 %patch161 -p1 +%patch162 -p1 +%patch163 -p1 %patch170 -p1 %patch171 -p1 %patch172 -p1 @@ -623,10 +648,6 @@ Authors: %patch353 -p1 %patch354 -p1 %patch355 -p1 -%patch356 -p1 -%patch357 -p1 -%patch358 -p1 -%patch359 -p1 # Don't use shim for now %ifarch x86_64 %patch400 -p1 @@ -976,6 +997,31 @@ rm -f $RPM_BUILD_ROOT/%{_libdir}/xen/bin/qemu-dm.debug /sbin/ldconfig %changelog +* Thu Nov 20 2008 kwolf@suse.de +- bnc#444731 - Fix data corruption bug (caused by broken x86 + emulation for movnti instruction) + xen-x86-emulate-movnti.patch +* Wed Nov 19 2008 kwolf@suse.de +- Report device model errors during the creation of snapshots + to xend instead of failing silently +* Wed Nov 19 2008 kwolf@suse.de +- bnc#445659 - ioemu: Workaround for VNC client initialization + race with xenfb changing the resolution (caused VNC connection + to be closed, vm-install recognized this as failed installation) + ioemu-vnc-resize.patch +* Tue Nov 18 2008 carnold@novell.com +- bnc#444203 - With EPT mode4, HVM S3 causes Xen HV crash. 
+ 18783-hvm-vcpu-reset-state-fix.patch +* Mon Nov 17 2008 carnold@novell.com +- bnc#444731 - Blackscreen instead of second stage during + installation + 18766-realmode-stack-size-fix.patch +* Thu Nov 13 2008 carnold@novell.com +- bnc#429739 - Network failure with bnx2 when booted to XEN + 18778-msi-irq-fix.patch +* Wed Nov 12 2008 kwolf@suse.de +- bnc#444197 - Add udev rule to fix domUloader race with + automounter (udev-rules.patch) * Sun Nov 09 2008 ro@suse.de - disable kmp to fix build again * Fri Nov 07 2008 kwolf@suse.de