--- 2009-01-08.orig/tools/include/xen-foreign/reference.size 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/tools/include/xen-foreign/reference.size 2009-01-08 10:56:30.000000000 +0100 @@ -1,7 +1,7 @@ structs | x86_32 x86_64 ia64 -start_info | 1104 1152 1152 +start_info | 1112 1168 1168 trap_info | 8 16 - pt_fpreg | - - 16 cpu_user_regs | 68 200 - --- 2009-01-08.orig/xen/arch/x86/domain_build.c 2009-01-08 10:56:13.000000000 +0100 +++ 2009-01-08/xen/arch/x86/domain_build.c 2009-01-08 11:44:42.000000000 +0100 @@ -341,6 +341,12 @@ int __init construct_dom0( #endif } + if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) ) + { + printk(XENLOG_WARNING "P2M table base ignored\n"); + parms.p2m_base = UNSET_ADDR; + } + domain_set_alloc_bitsize(d); /* @@ -359,6 +365,8 @@ int __init construct_dom0( vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ? sizeof(unsigned long) : sizeof(unsigned int))); + if ( parms.p2m_base != UNSET_ADDR ) + vphysmap_end = vphysmap_start; vstartinfo_start = round_pgup(vphysmap_end); vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + @@ -400,6 +408,11 @@ int __init construct_dom0( /* Ensure that our low-memory 1:1 mapping covers the allocation. */ page = alloc_domheap_pages(d, order, MEMF_bits(30)); #else + if ( parms.p2m_base != UNSET_ADDR ) + { + vphysmap_start = parms.p2m_base; + vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long); + } page = alloc_domheap_pages(d, order, 0); #endif if ( page == NULL ) @@ -740,8 +753,109 @@ int __init construct_dom0( snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s", elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : ""); + count = d->tot_pages; +#ifdef __x86_64__ + /* Set up the phys->machine table if not part of the initial mapping. 
*/ + if ( parms.p2m_base != UNSET_ADDR ) + { + unsigned long va = vphysmap_start; + + if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) + panic("DOM0 P->M table overlaps initial mapping\n"); + + while ( va < vphysmap_end ) + { + if ( d->tot_pages + ((round_pgup(vphysmap_end) - va) + >> PAGE_SHIFT) + 3 > nr_pages ) + panic("Dom0 allocation too small for initial P->M table.\n"); + + l4tab = l4start + l4_table_offset(va); + if ( !l4e_get_intpte(*l4tab) ) + { + page = alloc_domheap_pages(d, 0, 0); + if ( !page ) + break; + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = + PGT_l3_page_table | PGT_validated | 1; + clear_page(page_to_virt(page)); + *l4tab = l4e_from_page(page, L4_PROT); + } + l3tab = page_to_virt(l4e_get_page(*l4tab)); + l3tab += l3_table_offset(va); + if ( !l3e_get_intpte(*l3tab) ) + { + if ( cpu_has_page1gb && + !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L3_PAGETABLE_SHIFT - + PAGE_SHIFT, + 0)) != NULL ) + { + *l3tab = l3e_from_page(page, + L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + va += 1UL << L3_PAGETABLE_SHIFT; + continue; + } + else if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL ) + break; + else + { + /* No mapping, PGC_allocated + page-table page. 
*/ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = + PGT_l2_page_table | PGT_validated | 1; + clear_page(page_to_virt(page)); + *l3tab = l3e_from_page(page, L3_PROT); + } + } + l2tab = page_to_virt(l3e_get_page(*l3tab)); + l2tab += l2_table_offset(va); + if ( !l2e_get_intpte(*l2tab) ) + { + if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L2_PAGETABLE_SHIFT - + PAGE_SHIFT, + 0)) != NULL ) + { + *l2tab = l2e_from_page(page, + L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + va += 1UL << L2_PAGETABLE_SHIFT; + continue; + } + else if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL ) + break; + else + { + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = + PGT_l1_page_table | PGT_validated | 1; + clear_page(page_to_virt(page)); + *l2tab = l2e_from_page(page, L2_PROT); + } + } + l1tab = page_to_virt(l2e_get_page(*l2tab)); + l1tab += l1_table_offset(va); + BUG_ON(l1e_get_intpte(*l1tab)); + page = alloc_domheap_pages(d, 0, 0); + if ( !page ) + break; + *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY); + va += PAGE_SIZE; + va &= PAGE_MASK; + } + if ( !page ) + panic("Not enough RAM for DOM0 P->M table.\n"); + } +#endif + /* Write the phys->machine and machine->phys table entries. 
*/ - for ( pfn = 0; pfn < d->tot_pages; pfn++ ) + for ( pfn = 0; pfn < count; pfn++ ) { mfn = pfn + alloc_spfn; #ifndef NDEBUG @@ -755,6 +869,26 @@ int __init construct_dom0( ((unsigned int *)vphysmap_start)[pfn] = mfn; set_gpfn_from_mfn(mfn, pfn); } + si->first_p2m_pfn = pfn; + si->nr_p2m_frames = d->tot_pages - count; + list_for_each_entry ( page, &d->page_list, list ) + { + mfn = page_to_mfn(page); + if ( get_gpfn_from_mfn(mfn) >= count ) + { + BUG_ON(is_pv_32bit_domain(d)); + if ( !page->u.inuse.type_info && + !get_page_and_type(page, d, PGT_writable_page) ) + BUG(); + ((unsigned long *)vphysmap_start)[pfn] = mfn; + set_gpfn_from_mfn(mfn, pfn); + ++pfn; +#ifndef NDEBUG + ++alloc_epfn; +#endif + } + } + BUG_ON(pfn != d->tot_pages); while ( pfn < nr_pages ) { if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL ) --- 2009-01-08.orig/xen/arch/x86/mm.c 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/xen/arch/x86/mm.c 2009-01-08 10:56:30.000000000 +0100 @@ -1013,7 +1013,8 @@ static int put_page_from_l2e(l2_pgentry_ { unsigned long mfn = l2e_get_pfn(l2e), m = mfn; int writeable = l2e_get_flags(l2e) & _PAGE_RW; - ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1))); + + ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1))); do { put_data_page(mfn_to_page(m), writeable); } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) ); @@ -1031,14 +1032,28 @@ static int __put_page_type(struct page_i static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, int partial, int preemptible) { - if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && - (l3e_get_pfn(l3e) != pfn) ) + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) ) + return 1; + +#ifdef __x86_64__ + if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) ) { - if ( unlikely(partial > 0) ) - return __put_page_type(l3e_get_page(l3e), preemptible); - return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); + unsigned long mfn = l3e_get_pfn(l3e); + int writeable = l3e_get_flags(l3e) & 
_PAGE_RW; + + ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); + do { + put_data_page(mfn_to_page(mfn), writeable); + } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) ); + + return 0; } - return 1; +#endif + + if ( unlikely(partial > 0) ) + return __put_page_type(l3e_get_page(l3e), preemptible); + + return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); } #if CONFIG_PAGING_LEVELS >= 4 --- 2009-01-08.orig/xen/common/libelf/libelf-dominfo.c 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/xen/common/libelf/libelf-dominfo.c 2009-01-08 10:56:30.000000000 +0100 @@ -90,6 +90,7 @@ int elf_xen_parse_note(struct elf_binary [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0}, [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0}, [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0}, + [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0}, [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0}, [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0}, [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1}, @@ -164,6 +165,9 @@ int elf_xen_parse_note(struct elf_binary case XEN_ELFNOTE_ENTRY: parms->virt_entry = val; break; + case XEN_ELFNOTE_INIT_P2M: + parms->p2m_base = val; + break; case XEN_ELFNOTE_PADDR_OFFSET: parms->elf_paddr_offset = val; break; @@ -392,6 +396,7 @@ static int elf_xen_addr_calc_check(struc elf_msg(elf, " virt_kstart = 0x%" PRIx64 "\n", parms->virt_kstart); elf_msg(elf, " virt_kend = 0x%" PRIx64 "\n", parms->virt_kend); elf_msg(elf, " virt_entry = 0x%" PRIx64 "\n", parms->virt_entry); + elf_msg(elf, " p2m_base = 0x%" PRIx64 "\n", parms->p2m_base); if ( (parms->virt_kstart > parms->virt_kend) || (parms->virt_entry < parms->virt_kstart) || @@ -403,6 +408,15 @@ static int elf_xen_addr_calc_check(struc return -1; } + if ( (parms->p2m_base != UNSET_ADDR) && + (parms->p2m_base >= parms->virt_kstart) && + (parms->p2m_base < parms->virt_kend) ) + { + elf_err(elf, "%s: ERROR: P->M table base is out of bounds.\n", + __FUNCTION__); + return -1; + } + return 0; } @@ 
-422,6 +436,7 @@ int elf_xen_parse(struct elf_binary *elf parms->virt_entry = UNSET_ADDR; parms->virt_hypercall = UNSET_ADDR; parms->virt_hv_start_low = UNSET_ADDR; + parms->p2m_base = UNSET_ADDR; parms->elf_paddr_offset = UNSET_ADDR; /* Find and parse elf notes. */ --- 2009-01-08.orig/xen/include/public/elfnote.h 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/xen/include/public/elfnote.h 2009-01-08 10:56:30.000000000 +0100 @@ -162,9 +162,20 @@ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 /* + * The (non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be able to handle the page table pages used for + * this mapping not being accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* * The number of the highest elfnote defined. */ -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M /* * System information exported through crash notes. --- 2009-01-08.orig/xen/include/public/libelf.h 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/xen/include/public/libelf.h 2009-01-08 10:56:30.000000000 +0100 @@ -232,6 +232,7 @@ struct elf_dom_parms { uint64_t virt_entry; uint64_t virt_hypercall; uint64_t virt_hv_start_low; + uint64_t p2m_base; uint64_t elf_paddr_offset; uint32_t f_supported[XENFEAT_NR_SUBMAPS]; uint32_t f_required[XENFEAT_NR_SUBMAPS]; --- 2009-01-08.orig/xen/include/public/xen.h 2009-01-08 11:44:11.000000000 +0100 +++ 2009-01-08/xen/include/public/xen.h 2009-01-08 10:56:30.000000000 +0100 @@ -513,6 +513,7 @@ typedef struct shared_info shared_info_t * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] * c. 
list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] * e. bootstrap page tables [pt_base, CR3 (x86)] * f. bootstrap stack [register ESP (x86)] @@ -554,6 +555,9 @@ struct start_info { unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; typedef struct start_info start_info_t;