# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1231414359 0
# Node ID 292919f6123823916f1274f3d512794f72f3e903
# Parent 97f8d6453fdae1a865a3c875d7b712a494304fb0
x86-64: guest directed placement of initial p->m map

By adding another ELF note, the kernel can now direct the hypervisor
(for Dom0) and in the future also the tools (for DomU-s) to place the
initial phys->mach translation table at other than an address
immediately above the kernel/initrd images. This eliminates the size
restriction imposed on this table by Linux (the kernel loads above the
-2Gb boundary, and hence the entire initial mapping cannot reach or
even exceed 2Gb).

There are a few items in this patch I'm not particularly happy with,
but couldn't think of a better solution:
- there is a hidden assumption that pages allocated for the domain are
  put on the domain's page list sequentially
- the way backward compatibility is maintained is placing requirements
  on the kernel side that make the code somewhat convoluted (because it
  needs to check where the map is actually placed in quite a few
  places)
- code is there to use 1Gb mappings for the hypervisor created table,
  but lacking a machine with 512G+ memory for immediate testing I can't
  verify this works; I know that 2Mb mappings work, and hence imply
  that 1Gb ones would too (of course, if the kernel replaces the table
  - like Linux does -, it cannot use 2Mb/1Gb mappings or even try to
  re-use the page table entries, but I don't consider this a problem)

Signed-off-by: Jan Beulich <jbeulich@novell.com>

--- a/tools/include/xen-foreign/reference.size
+++ b/tools/include/xen-foreign/reference.size
@@ -1,7 +1,7 @@
 
 structs              |  x86_32  x86_64  ia64
 
-start_info           |    1104    1152    1152
+start_info           |    1112    1168    1168
 trap_info            |       8      16       -
 pt_fpreg             |       -       -      16
 cpu_user_regs        |      68     200       -
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -341,6 +341,12 @@ int __init construct_dom0(
 #endif
     }
 
+    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
+    {
+        printk(XENLOG_WARNING "P2M table base ignored\n");
+        parms.p2m_base = UNSET_ADDR;
+    }
+
     domain_set_alloc_bitsize(d);
 
     /*
@@ -359,6 +365,8 @@ int __init construct_dom0(
     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
                                                      sizeof(unsigned long) :
                                                      sizeof(unsigned int)));
+    if ( parms.p2m_base != UNSET_ADDR )
+        vphysmap_end = vphysmap_start;
     vstartinfo_start = round_pgup(vphysmap_end);
     vstartinfo_end   = (vstartinfo_start +
                        sizeof(struct start_info) +
@@ -400,6 +408,11 @@ int __init construct_dom0(
     /* Ensure that our low-memory 1:1 mapping covers the allocation. */
     page = alloc_domheap_pages(d, order, MEMF_bits(30));
 #else
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        vphysmap_start = parms.p2m_base;
+        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
+    }
     page = alloc_domheap_pages(d, order, 0);
 #endif
     if ( page == NULL )
@@ -749,8 +762,109 @@ int __init construct_dom0(
     snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
+    count = d->tot_pages;
+#ifdef __x86_64__
+    /* Set up the phys->machine table if not part of the initial mapping. */
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        unsigned long va = vphysmap_start;
+
+        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
+            panic("DOM0 P->M table overlaps initial mapping");
+
+        while ( va < vphysmap_end )
+        {
+            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
+                                 >> PAGE_SHIFT) + 3 > nr_pages )
+                panic("Dom0 allocation too small for initial P->M table.\n");
+
+            l4tab = l4start + l4_table_offset(va);
+            if ( !l4e_get_intpte(*l4tab) )
+            {
+                page = alloc_domheap_pages(d, 0, 0);
+                if ( !page )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l3_page_table | PGT_validated | 1;
+                clear_page(page_to_virt(page));
+                *l4tab = l4e_from_page(page, L4_PROT);
+            }
+            l3tab = page_to_virt(l4e_get_page(*l4tab));
+            l3tab += l3_table_offset(va);
+            if ( !l3e_get_intpte(*l3tab) )
+            {
+                if ( cpu_has_page1gb &&
+                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L3_PAGETABLE_SHIFT -
+                                                 PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l3tab = l3e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L3_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l2_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l3tab = l3e_from_page(page, L3_PROT);
+                }
+            }
+            l2tab = page_to_virt(l3e_get_page(*l3tab));
+            l2tab += l2_table_offset(va);
+            if ( !l2e_get_intpte(*l2tab) )
+            {
+                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L2_PAGETABLE_SHIFT -
+                                                 PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l2tab = l2e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L2_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l1_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l2tab = l2e_from_page(page, L2_PROT);
+                }
+            }
+            l1tab = page_to_virt(l2e_get_page(*l2tab));
+            l1tab += l1_table_offset(va);
+            BUG_ON(l1e_get_intpte(*l1tab));
+            page = alloc_domheap_pages(d, 0, 0);
+            if ( !page )
+                break;
+            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
+            va += PAGE_SIZE;
+            va &= PAGE_MASK;
+        }
+        if ( !page )
+            panic("Not enough RAM for DOM0 P->M table.\n");
+    }
+#endif
+
     /* Write the phys->machine and machine->phys table entries. */
-    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
+    for ( pfn = 0; pfn < count; pfn++ )
     {
         mfn = pfn + alloc_spfn;
 #ifndef NDEBUG
@@ -764,6 +878,26 @@ int __init construct_dom0(
             ((unsigned int *)vphysmap_start)[pfn] = mfn;
         set_gpfn_from_mfn(mfn, pfn);
     }
+    si->first_p2m_pfn = pfn;
+    si->nr_p2m_frames = d->tot_pages - count;
+    list_for_each_entry ( page, &d->page_list, list )
+    {
+        mfn = page_to_mfn(page);
+        if ( get_gpfn_from_mfn(mfn) >= count )
+        {
+            BUG_ON(is_pv_32bit_domain(d));
+            if ( !page->u.inuse.type_info &&
+                 !get_page_and_type(page, d, PGT_writable_page) )
+                BUG();
+            ((unsigned long *)vphysmap_start)[pfn] = mfn;
+            set_gpfn_from_mfn(mfn, pfn);
+            ++pfn;
+#ifndef NDEBUG
+            ++alloc_epfn;
+#endif
+        }
+    }
+    BUG_ON(pfn != d->tot_pages);
     while ( pfn < nr_pages )
     {
         if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -948,19 +948,44 @@ void put_page_from_l1e(l1_pgentry_t l1e,
 }
 
 
+static void put_data_page(
+    struct page_info *page, int writeable)
+{
+    if ( writeable )
+        put_page_and_type(page);
+    else
+    {
+        BUG_ON((page->u.inuse.type_info & PGT_type_mask) == PGT_seg_desc_page
+               && (page->u.inuse.type_info & PGT_count_mask) != 0);
+        put_page(page);
+    }
+}
+
 /*
  * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  * Note also that this automatically deals correctly with linear p.t.'s.
  */
 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
 {
-    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
-         (l2e_get_pfn(l2e) != pfn) )
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+        return 1;
+
+    if ( unlikely(l2e_get_flags(l2e) & _PAGE_PSE) )
+    {
+        unsigned long mfn = l2e_get_pfn(l2e);
+        int writeable = l2e_get_flags(l2e) & _PAGE_RW;
+
+        ASSERT(!(mfn & ((1UL << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+        do {
+            put_data_page(mfn_to_page(mfn), writeable);
+        } while ( ++mfn & ((1UL << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+    }
+    else
     {
         put_page_and_type(l2e_get_page(l2e));
-        return 0;
     }
-    return 1;
+
+    return 0;
 }
 
 static int __put_page_type(struct page_info *, int preemptible);
@@ -968,14 +993,28 @@ static int __put_page_type(struct page_i
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
                              int partial, int preemptible)
 {
-    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
-         (l3e_get_pfn(l3e) != pfn) )
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
+        return 1;
+
+#ifdef __x86_64__
+    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        if ( unlikely(partial > 0) )
-            return __put_page_type(l3e_get_page(l3e), preemptible);
-        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+        unsigned long mfn = l3e_get_pfn(l3e);
+        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
+        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+        do {
+            put_data_page(mfn_to_page(mfn), writeable);
+        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+
+        return 0;
     }
-    return 1;
+#endif
+
+    if ( unlikely(partial > 0) )
+        return __put_page_type(l3e_get_page(l3e), preemptible);
+
+    return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
--- a/xen/common/libelf/libelf-dominfo.c
+++ b/xen/common/libelf/libelf-dominfo.c
@@ -90,6 +90,7 @@ int elf_xen_parse_note(struct elf_binary
         [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0},
         [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0},
         [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0},
+        [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0},
         [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0},
         [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0},
         [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1},
@@ -164,6 +165,9 @@ int elf_xen_parse_note(struct elf_binary
     case XEN_ELFNOTE_ENTRY:
         parms->virt_entry = val;
         break;
+    case XEN_ELFNOTE_INIT_P2M:
+        parms->p2m_base = val;
+        break;
     case XEN_ELFNOTE_PADDR_OFFSET:
         parms->elf_paddr_offset = val;
         break;
@@ -392,6 +396,7 @@ static int elf_xen_addr_calc_check(struc
     elf_msg(elf, " virt_kstart = 0x%" PRIx64 "\n", parms->virt_kstart);
     elf_msg(elf, " virt_kend = 0x%" PRIx64 "\n", parms->virt_kend);
     elf_msg(elf, " virt_entry = 0x%" PRIx64 "\n", parms->virt_entry);
+    elf_msg(elf, " p2m_base = 0x%" PRIx64 "\n", parms->p2m_base);
 
     if ( (parms->virt_kstart > parms->virt_kend) ||
          (parms->virt_entry < parms->virt_kstart) ||
@@ -403,6 +408,15 @@ static int elf_xen_addr_calc_check(struc
         return -1;
     }
 
+    if ( (parms->p2m_base != UNSET_ADDR) &&
+         (parms->p2m_base >= parms->virt_kstart) &&
+         (parms->p2m_base < parms->virt_kend) )
+    {
+        elf_err(elf, "%s: ERROR: P->M table base is out of bounds.\n",
+                __FUNCTION__);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -422,6 +436,7 @@ int elf_xen_parse(struct elf_binary *elf
     parms->virt_entry = UNSET_ADDR;
     parms->virt_hypercall = UNSET_ADDR;
     parms->virt_hv_start_low = UNSET_ADDR;
+    parms->p2m_base = UNSET_ADDR;
     parms->elf_paddr_offset = UNSET_ADDR;
 
     /* Find and parse elf notes. */
--- a/xen/include/public/elfnote.h
+++ b/xen/include/public/elfnote.h
@@ -162,9 +162,20 @@
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
 /*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M 15
+
+/*
  * The number of the highest elfnote defined.
  */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
 
 /*
  * System information exported through crash notes.
--- a/xen/include/public/libelf.h
+++ b/xen/include/public/libelf.h
@@ -232,6 +232,7 @@ struct elf_dom_parms {
     uint64_t virt_entry;
     uint64_t virt_hypercall;
     uint64_t virt_hv_start_low;
+    uint64_t p2m_base;
     uint64_t elf_paddr_offset;
     uint32_t f_supported[XENFEAT_NR_SUBMAPS];
     uint32_t f_required[XENFEAT_NR_SUBMAPS];
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -513,6 +513,7 @@ typedef struct shared_info shared_info_t
  *  a. relocated kernel image
  *  b. initial ram disk              [mod_start, mod_len]
  *  c. list of allocated page frames [mfn_list, nr_pages]
+ *     (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *  d. start_info_t structure        [register ESI (x86)]
  *  e. bootstrap page tables         [pt_base, CR3 (x86)]
  *  f. bootstrap stack               [register ESP (x86)]
@@ -554,6 +555,9 @@ struct start_info {
     unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
     unsigned long mod_len;   /* Size (bytes) of pre-loaded module.    */
     int8_t cmd_line[MAX_GUEST_CMDLINE];
+    /* The pfn range here covers both page table and p->m table frames. */
+    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */
+    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
 };
 typedef struct start_info start_info_t;
 