From: Olaf Hering Date: Mon, 7 Aug 2017 12:58:02 +0000 Subject: libxc sr restore hvm legacy superpage tools: use superpages during restore of HVM guest bsc#1035231 - migration of HVM domU does not use superpages on destination dom0 bsc#1055695 - XEN: 11SP4 and 12SP3 HVM guests can not be restored During creation of an HVM domU meminit_hvm() tries to map superpages. After save/restore or migration this mapping is lost, everything is allocated in single pages. This causes a performance degradation after migration. Add necessary code to preallocate a superpage for an incoming chunk of pfns. In case a pfn was not populated on the sending side, it must be freed on the receiving side to avoid over-allocation. The existing code for x86_pv is moved unmodified into its own file. Signed-off-by: Olaf Hering --- tools/libs/guest/xg_dom_x86.c | 5 - tools/libs/guest/xg_private.h | 5 + tools/libs/guest/xg_sr_common.h | 28 +- tools/libs/guest/xg_sr_restore.c | 60 +--- tools/libs/guest/xg_sr_restore_x86_hvm.c | 381 ++++++++++++++++++++++- tools/libs/guest/xg_sr_restore_x86_pv.c | 61 +++- 6 files changed, 467 insertions(+), 73 deletions(-) --- a/tools/libs/guest/xg_dom_x86.c +++ b/tools/libs/guest/xg_dom_x86.c @@ -44,11 +44,6 @@ #define SUPERPAGE_BATCH_SIZE 512 -#define SUPERPAGE_2MB_SHIFT 9 -#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) -#define SUPERPAGE_1GB_SHIFT 18 -#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) - #define X86_CR0_PE 0x01 #define X86_CR0_ET 0x10 --- a/tools/libs/guest/xg_private.h +++ b/tools/libs/guest/xg_private.h @@ -180,4 +180,9 @@ struct xc_cpu_policy { }; #endif /* x86 */ +#define SUPERPAGE_2MB_SHIFT 9 +#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) +#define SUPERPAGE_1GB_SHIFT 18 +#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) + #endif /* XG_PRIVATE_H */ --- a/tools/libs/guest/xg_sr_common.h +++ b/tools/libs/guest/xg_sr_common.h @@ -208,6 +208,16 @@ struct xc_sr_restore_ops int (*setup)(struct xc_sr_context 
*ctx); /** + * Populate PFNs + * + * Given a set of pfns, obtain memory from Xen to fill the physmap for the + * unpopulated subset. + */ + int (*populate_pfns)(struct xc_sr_context *ctx, unsigned count, + const xen_pfn_t *original_pfns, const uint32_t *types); + + + /** * Process an individual record from the stream. The caller shall take * care of processing common records (e.g. END, PAGE_DATA). * @@ -338,6 +348,8 @@ struct xc_sr_context int send_back_fd; unsigned long p2m_size; + unsigned long max_pages; + unsigned long tot_pages; xc_hypercall_buffer_t dirty_bitmap_hbuf; /* From Image Header. */ @@ -471,6 +483,14 @@ struct xc_sr_context { /* HVM context blob. */ struct xc_sr_blob context; + + /* Bitmap of currently allocated PFNs during restore. */ + struct sr_bitmap attempted_1g; + struct sr_bitmap attempted_2m; + struct sr_bitmap allocated_pfns; + xen_pfn_t prev_populated_pfn; + xen_pfn_t iteration_tracker_pfn; + unsigned long iteration; } restore; }; } hvm; @@ -535,14 +555,6 @@ int read_record_header(struct xc_sr_cont int read_record_data(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr, struct xc_sr_record *rec); -/* - * This would ideally be private in restore.c, but is needed by - * x86_pv_localise_page() if we receive pagetables frames ahead of the - * contents of the frames they point at. - */ -int populate_pfns(struct xc_sr_context *ctx, unsigned int count, - const xen_pfn_t *original_pfns, const uint32_t *types); - /* Handle a STATIC_DATA_END record. */ int handle_static_data_end(struct xc_sr_context *ctx); --- a/tools/libs/guest/xg_sr_restore.c +++ b/tools/libs/guest/xg_sr_restore.c @@ -71,60 +71,6 @@ static int read_headers(struct xc_sr_con return 0; } -/* - * Given a set of pfns, obtain memory from Xen to fill the physmap for the - * unpopulated subset. If types is NULL, no page type checking is performed - * and all unpopulated pfns are populated. 
- */ -int populate_pfns(struct xc_sr_context *ctx, unsigned int count, - const xen_pfn_t *original_pfns, const uint32_t *types) -{ - xc_interface *xch = ctx->xch; - unsigned int i, nr_pfns = 0; - int rc = -1; - - for ( i = 0; i < count; ++i ) - { - if ( (!types || page_type_to_populate(types[i])) && - !pfn_is_populated(ctx, original_pfns[i]) ) - { - rc = pfn_set_populated(ctx, original_pfns[i]); - if ( rc ) - goto err; - ctx->restore.pp_pfns[nr_pfns] = ctx->restore.pp_mfns[nr_pfns] = original_pfns[i]; - ++nr_pfns; - } - } - - if ( nr_pfns ) - { - rc = xc_domain_populate_physmap_exact( - xch, ctx->domid, nr_pfns, 0, 0, ctx->restore.pp_mfns); - if ( rc ) - { - PERROR("Failed to populate physmap"); - goto err; - } - - for ( i = 0; i < nr_pfns; ++i ) - { - if ( ctx->restore.pp_mfns[i] == INVALID_MFN ) - { - ERROR("Populate physmap failed for pfn %u", i); - rc = -1; - goto err; - } - - ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], ctx->restore.pp_mfns[i]); - } - } - - rc = 0; - - err: - return rc; -} - static int handle_static_data_end_v2(struct xc_sr_context *ctx) { int rc = 0; @@ -259,7 +205,8 @@ static int map_guest_pages(struct xc_sr_ uint32_t i, p; int rc; - rc = populate_pfns(ctx, pages->count, ctx->restore.pfns, ctx->restore.types); + rc = ctx->restore.ops.populate_pfns(ctx, pages->count, ctx->restore.pfns, + ctx->restore.types); if ( rc ) { ERROR("Failed to populate pfns for batch of %u pages", pages->count); @@ -1074,6 +1021,9 @@ int xc_domain_restore(xc_interface *xch, return -1; } + /* See xc_domain_getinfo */ + ctx.restore.max_pages = ctx.dominfo.max_pages; + ctx.restore.tot_pages = ctx.dominfo.tot_pages; ctx.restore.p2m_size = nr_pfns; ctx.restore.ops = hvm ? 
restore_ops_x86_hvm : restore_ops_x86_pv; --- a/tools/libs/guest/xg_sr_restore_x86_hvm.c +++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c @@ -130,6 +130,33 @@ static int x86_hvm_localise_page(struct return 0; } +static bool x86_hvm_expand_sp_bitmaps(struct xc_sr_context *ctx, unsigned long max_pfn) +{ + struct sr_bitmap *bm; + + bm = &ctx->x86.hvm.restore.attempted_1g; + if ( !sr_bitmap_expand(bm, max_pfn >> SUPERPAGE_1GB_SHIFT) ) + return false; + + bm = &ctx->x86.hvm.restore.attempted_2m; + if ( !sr_bitmap_expand(bm, max_pfn >> SUPERPAGE_2MB_SHIFT) ) + return false; + + bm = &ctx->x86.hvm.restore.allocated_pfns; + if ( !sr_bitmap_expand(bm, max_pfn) ) + return false; + + return true; +} + +static void x86_hvm_no_superpage(struct xc_sr_context *ctx, unsigned long addr) +{ + unsigned long pfn = addr >> XC_PAGE_SHIFT; + + sr_set_bit(pfn >> SUPERPAGE_1GB_SHIFT, &ctx->x86.hvm.restore.attempted_1g); + sr_set_bit(pfn >> SUPERPAGE_2MB_SHIFT, &ctx->x86.hvm.restore.attempted_2m); +} + /* * restore_ops function. Confirms the stream matches the domain. 
*/ @@ -164,12 +191,24 @@ static int x86_hvm_setup(struct xc_sr_co max_pfn = max(ctx->restore.p2m_size, max_pages); if ( !sr_bitmap_expand(&ctx->restore.populated_pfns, max_pfn) ) - { - PERROR("Unable to allocate memory for populated_pfns bitmap"); - return -1; - } + goto out; + + if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) ) + goto out; + + /* FIXME: distinguish between PVH and HVM */ + /* No superpage in 1st 2MB due to VGA hole */ + x86_hvm_no_superpage(ctx, 0xA0000u); +#define LAPIC_BASE_ADDRESS 0xfee00000u +#define ACPI_INFO_PHYSICAL_ADDRESS 0xfc000000u + x86_hvm_no_superpage(ctx, LAPIC_BASE_ADDRESS); + x86_hvm_no_superpage(ctx, ACPI_INFO_PHYSICAL_ADDRESS); return 0; + +out: + PERROR("Unable to allocate memory for pfn bitmaps"); + return -1; } /* @@ -250,6 +289,9 @@ static int x86_hvm_stream_complete(struc static int x86_hvm_cleanup(struct xc_sr_context *ctx) { sr_bitmap_free(&ctx->restore.populated_pfns); + sr_bitmap_free(&ctx->x86.hvm.restore.attempted_1g); + sr_bitmap_free(&ctx->x86.hvm.restore.attempted_2m); + sr_bitmap_free(&ctx->x86.hvm.restore.allocated_pfns); free(ctx->x86.hvm.restore.context.ptr); free(ctx->x86.restore.cpuid.ptr); @@ -258,6 +300,336 @@ static int x86_hvm_cleanup(struct xc_sr_ return 0; } +/* + * Set a range of pfns as allocated + */ +static void pfn_set_long_allocated(struct xc_sr_context *ctx, xen_pfn_t base_pfn) +{ + sr_set_long_bit(base_pfn, &ctx->x86.hvm.restore.allocated_pfns); +} + +static void pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + sr_set_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns); +} + +struct x86_hvm_sp { + xen_pfn_t pfn; + xen_pfn_t base_pfn; + unsigned long index; + unsigned long count; +}; + +/* + * Try to allocate a 1GB page for this pfn, but avoid Over-allocation. + * If this succeeds, mark the range of 2MB pages as busy. 
+ */ +static bool x86_hvm_alloc_1g(struct xc_sr_context *ctx, struct x86_hvm_sp *sp) +{ + xc_interface *xch = ctx->xch; + unsigned int order; + int i, done; + xen_pfn_t extent; + + /* Only one attempt to avoid overlapping allocation */ + if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_1g) ) + return false; + + order = SUPERPAGE_1GB_SHIFT; + sp->count = SUPERPAGE_1GB_NR_PFNS; + + /* Allocate only if there is room for another superpage */ + if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) + return false; + + extent = sp->base_pfn = (sp->pfn >> order) << order; + done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent); + if ( done < 0 ) { + PERROR("populate_physmap failed."); + return false; + } + if ( done == 0 ) + return false; + + DPRINTF("1G %" PRI_xen_pfn "\n", sp->base_pfn); + + /* Mark all 2MB pages as done to avoid overlapping allocation */ + for ( i = 0; i < (SUPERPAGE_1GB_NR_PFNS/SUPERPAGE_2MB_NR_PFNS); i++ ) + sr_set_bit((sp->base_pfn >> SUPERPAGE_2MB_SHIFT) + i, &ctx->x86.hvm.restore.attempted_2m); + + return true; +} + +/* Allocate a 2MB page if x86_hvm_alloc_1g failed, avoid Over-allocation. 
*/ +static bool x86_hvm_alloc_2m(struct xc_sr_context *ctx, struct x86_hvm_sp *sp) +{ + xc_interface *xch = ctx->xch; + unsigned int order; + int done; + xen_pfn_t extent; + + /* Only one attempt to avoid overlapping allocation */ + if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_2m) ) + return false; + + order = SUPERPAGE_2MB_SHIFT; + sp->count = SUPERPAGE_2MB_NR_PFNS; + + /* Allocate only if there is room for another superpage */ + if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) + return false; + + extent = sp->base_pfn = (sp->pfn >> order) << order; + done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent); + if ( done < 0 ) { + PERROR("populate_physmap failed."); + return false; + } + if ( done == 0 ) + return false; + + DPRINTF("2M %" PRI_xen_pfn "\n", sp->base_pfn); + return true; +} + +/* Allocate a single page if x86_hvm_alloc_2m failed. */ +static bool x86_hvm_alloc_4k(struct xc_sr_context *ctx, struct x86_hvm_sp *sp) +{ + xc_interface *xch = ctx->xch; + unsigned int order; + int done; + xen_pfn_t extent; + + order = 0; + sp->count = 1UL; + + /* Allocate only if there is room for another page */ + if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) { + errno = E2BIG; + return false; + } + + extent = sp->base_pfn = (sp->pfn >> order) << order; + done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent); + if ( done < 0 ) { + PERROR("populate_physmap failed."); + return false; + } + if ( done == 0 ) { + errno = ENOMEM; + return false; + } + + DPRINTF("4K %" PRI_xen_pfn "\n", sp->base_pfn); + return true; +} +/* + * Attempt to allocate a superpage where the pfn resides. 
+ */ +static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + bool success; + unsigned long idx_1g, idx_2m; + struct x86_hvm_sp sp = { + .pfn = pfn + }; + + if ( sr_test_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) ) + return 0; + + idx_1g = pfn >> SUPERPAGE_1GB_SHIFT; + idx_2m = pfn >> SUPERPAGE_2MB_SHIFT; + + sp.index = idx_1g; + success = x86_hvm_alloc_1g(ctx, &sp); + + if ( success == false ) { + sp.index = idx_2m; + success = x86_hvm_alloc_2m(ctx, &sp); + } + + if ( success == false ) { + sp.index = 0; + success = x86_hvm_alloc_4k(ctx, &sp); + } + + if ( success == false ) + return -1; + + do { + if ( sp.count >= BITS_PER_LONG && (sp.count % BITS_PER_LONG) == 0 ) { + sp.count -= BITS_PER_LONG; + ctx->restore.tot_pages += BITS_PER_LONG; + pfn_set_long_allocated(ctx, sp.base_pfn + sp.count); + } else { + sp.count--; + ctx->restore.tot_pages++; + pfn_set_allocated(ctx, sp.base_pfn + sp.count); + } + } while ( sp.count ); + + return 0; +} + +/* + * Deallocate memory. + * There was likely an optimistic superpage allocation. + * This means more pages may have been allocated past gap_end. + * This range is not freed now. Incoming higher pfns will release it. 
+ */ +static int x86_hvm_punch_hole(struct xc_sr_context *ctx, + xen_pfn_t gap_start, xen_pfn_t gap_end) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t _pfn, pfn; + uint32_t domid, freed = 0; + int rc; + + pfn = gap_start >> SUPERPAGE_1GB_SHIFT; + do + { + sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_1g); + } while (++pfn <= gap_end >> SUPERPAGE_1GB_SHIFT); + + pfn = gap_start >> SUPERPAGE_2MB_SHIFT; + do + { + sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_2m); + } while (++pfn <= gap_end >> SUPERPAGE_2MB_SHIFT); + + pfn = gap_start; + + while ( pfn <= gap_end ) + { + if ( sr_test_and_clear_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) ) + { + domid = ctx->domid; + _pfn = pfn; + rc = xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &_pfn); + if ( rc ) + { + PERROR("Failed to release pfn %" PRI_xen_pfn, pfn); + return -1; + } + ctx->restore.tot_pages--; + freed++; + } + pfn++; + } + if ( freed ) + DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n", + freed, gap_start, gap_end); + return 0; +} + +static int x86_hvm_unpopulate_page(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + sr_clear_bit(pfn, &ctx->restore.populated_pfns); + return x86_hvm_punch_hole(ctx, pfn, pfn); +} + +static int x86_hvm_populate_page(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xen_pfn_t gap_start, gap_end; + bool has_gap, first_iteration; + int rc; + + /* + * Check for a gap between the previous populated pfn and this pfn. + * In case a gap exists, it is required to punch a hole to release memory, + * starting after the previous pfn and before this pfn. + * + * But: this can be done only during the first iteration, which is the + * only place where superpage allocations are attempted. All following + * iterations lack the info to properly maintain prev_populated_pfn. 
+ */ + has_gap = ctx->x86.hvm.restore.prev_populated_pfn + 1 < pfn; + first_iteration = ctx->x86.hvm.restore.iteration == 0; + if ( has_gap && first_iteration ) + { + gap_start = ctx->x86.hvm.restore.prev_populated_pfn + 1; + gap_end = pfn - 1; + + rc = x86_hvm_punch_hole(ctx, gap_start, gap_end); + if ( rc ) + goto err; + } + + rc = x86_hvm_allocate_pfn(ctx, pfn); + if ( rc ) + goto err; + pfn_set_populated(ctx, pfn); + ctx->x86.hvm.restore.prev_populated_pfn = pfn; + + rc = 0; +err: + return rc; +} + +/* + * Try to allocate superpages. + * This works without memory map because the pfns arrive in incremental order. + * All pfn numbers and their type are submitted. + * Only pfns with data will have also pfn content transmitted. + */ +static int x86_hvm_populate_pfns(struct xc_sr_context *ctx, unsigned count, + const xen_pfn_t *original_pfns, + const uint32_t *types) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t pfn, min_pfn, max_pfn; + bool to_populate, populated; + unsigned i = count; + int rc = 0; + + min_pfn = count ? original_pfns[0] : 0; + max_pfn = count ? original_pfns[count - 1] : 0; + DPRINTF("batch of %u pfns between %" PRI_xen_pfn " %" PRI_xen_pfn "\n", + count, min_pfn, max_pfn); + + if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) ) + { + ERROR("Unable to allocate memory for pfn bitmaps"); + return -1; + } + + /* + * There is no indicator for a new iteration. + * Simulate it by checking if a lower pfn is coming in. + * In the end it matters only to know if this iteration is the first one. 
+ */ + if ( min_pfn < ctx->x86.hvm.restore.iteration_tracker_pfn ) + ctx->x86.hvm.restore.iteration++; + ctx->x86.hvm.restore.iteration_tracker_pfn = min_pfn; + + for ( i = 0; i < count; ++i ) + { + pfn = original_pfns[i]; + + to_populate = page_type_to_populate(types[i]); + populated = pfn_is_populated(ctx, pfn); + + /* + * page has data, pfn populated: nothing to do + * page has data, pfn not populated: likely never seen before + * page has no data, pfn populated: likely ballooned out during migration + * page has no data, pfn not populated: nothing to do + */ + if ( to_populate && !populated ) + { + rc = x86_hvm_populate_page(ctx, pfn); + } else if ( !to_populate && populated ) + { + rc = x86_hvm_unpopulate_page(ctx, pfn); + } + if ( rc ) + break; + } + + return rc; +} + + struct xc_sr_restore_ops restore_ops_x86_hvm = { .pfn_is_valid = x86_hvm_pfn_is_valid, @@ -266,6 +638,7 @@ struct xc_sr_restore_ops restore_ops_x86 .set_page_type = x86_hvm_set_page_type, .localise_page = x86_hvm_localise_page, .setup = x86_hvm_setup, + .populate_pfns = x86_hvm_populate_pfns, .process_record = x86_hvm_process_record, .static_data_complete = x86_static_data_complete, .stream_complete = x86_hvm_stream_complete, --- a/tools/libs/guest/xg_sr_restore_x86_pv.c +++ b/tools/libs/guest/xg_sr_restore_x86_pv.c @@ -960,6 +960,64 @@ static void x86_pv_set_gfn(struct xc_sr_ } /* + * Given a set of pfns, obtain memory from Xen to fill the physmap for the + * unpopulated subset. If types is NULL, no page type checking is performed + * and all unpopulated pfns are populated. 
+ */ +static int x86_pv_populate_pfns(struct xc_sr_context *ctx, unsigned count, + const xen_pfn_t *original_pfns, + const uint32_t *types) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t *mfns = ctx->restore.pp_mfns, + *pfns = ctx->restore.pp_pfns; + unsigned int i, nr_pfns = 0; + int rc = -1; + + for ( i = 0; i < count; ++i ) + { + if ( (!types || + (types && page_type_has_stream_data(types[i]) == true)) && + !pfn_is_populated(ctx, original_pfns[i]) ) + { + rc = pfn_set_populated(ctx, original_pfns[i]); + if ( rc ) + goto err; + pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i]; + ++nr_pfns; + } + } + + if ( nr_pfns ) + { + rc = xc_domain_populate_physmap_exact( + xch, ctx->domid, nr_pfns, 0, 0, mfns); + if ( rc ) + { + PERROR("Failed to populate physmap"); + goto err; + } + + for ( i = 0; i < nr_pfns; ++i ) + { + if ( mfns[i] == INVALID_MFN ) + { + ERROR("Populate physmap failed for pfn %u", i); + rc = -1; + goto err; + } + + ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]); + } + } + + rc = 0; + + err: + return rc; +} + +/* * restore_ops function. Convert pfns back to mfns in pagetables. Possibly * needs to populate new frames if a PTE is found referring to a frame which * hasn't yet been seen from PAGE_DATA records. @@ -1003,7 +1061,7 @@ static int x86_pv_localise_page(struct x } } - if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) ) + if ( to_populate && x86_pv_populate_pfns(ctx, to_populate, pfns, NULL) ) return -1; for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) @@ -1200,6 +1258,7 @@ struct xc_sr_restore_ops restore_ops_x86 .set_gfn = x86_pv_set_gfn, .localise_page = x86_pv_localise_page, .setup = x86_pv_setup, + .populate_pfns = x86_pv_populate_pfns, .process_record = x86_pv_process_record, .static_data_complete = x86_static_data_complete, .stream_complete = x86_pv_stream_complete,