--- 2009-01-08.orig/tools/include/xen-foreign/reference.size	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/tools/include/xen-foreign/reference.size	2009-01-08 10:56:30.000000000 +0100
@@ -1,7 +1,7 @@
 
 structs                   |  x86_32  x86_64    ia64
 
-start_info                |    1104    1152    1152
+start_info                |    1112    1168    1168
 trap_info                 |       8      16       -
 pt_fpreg                  |       -       -      16
 cpu_user_regs             |      68     200       -
--- 2009-01-08.orig/xen/arch/x86/domain_build.c	2009-01-08 10:56:13.000000000 +0100
+++ 2009-01-08/xen/arch/x86/domain_build.c	2009-01-08 11:44:42.000000000 +0100
@@ -341,6 +341,12 @@ int __init construct_dom0(
 #endif
     }
 
+    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
+    {
+        printk(XENLOG_WARNING "P2M table base ignored\n");
+        parms.p2m_base = UNSET_ADDR;
+    }
+
     domain_set_alloc_bitsize(d);
 
     /*
@@ -359,6 +365,8 @@ int __init construct_dom0(
     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
                                                      sizeof(unsigned long) :
                                                      sizeof(unsigned int)));
+    if ( parms.p2m_base != UNSET_ADDR )
+        vphysmap_end = vphysmap_start;
     vstartinfo_start = round_pgup(vphysmap_end);
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
@@ -400,6 +408,11 @@ int __init construct_dom0(
     /* Ensure that our low-memory 1:1 mapping covers the allocation. */
     page = alloc_domheap_pages(d, order, MEMF_bits(30));
 #else
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        vphysmap_start = parms.p2m_base;
+        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
+    }
     page = alloc_domheap_pages(d, order, 0);
 #endif
     if ( page == NULL )
@@ -740,8 +753,109 @@ int __init construct_dom0(
     snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
+    count = d->tot_pages;
+#ifdef __x86_64__
+    /* Set up the phys->machine table if not part of the initial mapping. */
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        unsigned long va = vphysmap_start;
+
+        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
+            panic("DOM0 P->M table overlaps initial mapping");
+
+        while ( va < vphysmap_end )
+        {
+            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
+                                 >> PAGE_SHIFT) + 3 > nr_pages )
+                panic("Dom0 allocation too small for initial P->M table.\n");
+
+            l4tab = l4start + l4_table_offset(va);
+            if ( !l4e_get_intpte(*l4tab) )
+            {
+                page = alloc_domheap_pages(d, 0, 0);
+                if ( !page )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l3_page_table | PGT_validated | 1;
+                clear_page(page_to_virt(page));
+                *l4tab = l4e_from_page(page, L4_PROT);
+            }
+            l3tab = page_to_virt(l4e_get_page(*l4tab));
+            l3tab += l3_table_offset(va);
+            if ( !l3e_get_intpte(*l3tab) )
+            {
+                if ( cpu_has_page1gb &&
+                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L3_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l3tab = l3e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L3_PAGETABLE_SHIFT;
+                    continue;
+                }
+                else if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l2_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l3tab = l3e_from_page(page, L3_PROT);
+                }
+            }
+            l2tab = page_to_virt(l3e_get_page(*l3tab));
+            l2tab += l2_table_offset(va);
+            if ( !l2e_get_intpte(*l2tab) )
+            {
+                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L2_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l2tab = l2e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L2_PAGETABLE_SHIFT;
+                    continue;
+                }
+                else if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l1_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l2tab = l2e_from_page(page, L2_PROT);
+                }
+            }
+            l1tab = page_to_virt(l2e_get_page(*l2tab));
+            l1tab += l1_table_offset(va);
+            BUG_ON(l1e_get_intpte(*l1tab));
+            page = alloc_domheap_pages(d, 0, 0);
+            if ( !page )
+                break;
+            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
+            va += PAGE_SIZE;
+            va &= PAGE_MASK;
+        }
+        if ( !page )
+            panic("Not enough RAM for DOM0 P->M table.\n");
+    }
+#endif
+
     /* Write the phys->machine and machine->phys table entries. */
-    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
+    for ( pfn = 0; pfn < count; pfn++ )
     {
         mfn = pfn + alloc_spfn;
 #ifndef NDEBUG
@@ -755,6 +869,26 @@ int __init construct_dom0(
             ((unsigned int *)vphysmap_start)[pfn] = mfn;
         set_gpfn_from_mfn(mfn, pfn);
     }
+    si->first_p2m_pfn = pfn;
+    si->nr_p2m_frames = d->tot_pages - count;
+    list_for_each_entry ( page, &d->page_list, list )
+    {
+        mfn = page_to_mfn(page);
+        if ( get_gpfn_from_mfn(mfn) >= count )
+        {
+            BUG_ON(is_pv_32bit_domain(d));
+            if ( !page->u.inuse.type_info &&
+                 !get_page_and_type(page, d, PGT_writable_page) )
+                BUG();
+            ((unsigned long *)vphysmap_start)[pfn] = mfn;
+            set_gpfn_from_mfn(mfn, pfn);
+            ++pfn;
+#ifndef NDEBUG
+            ++alloc_epfn;
+#endif
+        }
+    }
+    BUG_ON(pfn != d->tot_pages);
     while ( pfn < nr_pages )
     {
         if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
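
The new domain_build.c code above maps the out-of-line P->M area with 1GB or
2MB superpages whenever the current virtual address is suitably aligned and
the remaining range still covers a whole superpage, falling back to 4K pages
otherwise. A minimal standalone sketch of that eligibility test (illustration
only, not code from this patch; the helper name is made up):

    #include <stdbool.h>
    #include <stdint.h>

    /* A level can be used when va is aligned to the level's size and the
     * end of the mapping still lies at least one full superpage away. */
    static bool can_use_superpage(uint64_t va, uint64_t end, unsigned int shift)
    {
        uint64_t size = (uint64_t)1 << shift;  /* 21 => 2MB, 30 => 1GB */
        return !(va & (size - 1)) && end >= va + size;
    }

This mirrors the conditions guarding the _PAGE_PSE branches above (plus the
cpu_has_page1gb check for the 1GB case).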
--- 2009-01-08.orig/xen/arch/x86/mm.c	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/xen/arch/x86/mm.c	2009-01-08 10:56:30.000000000 +0100
@@ -1013,7 +1013,8 @@ static int put_page_from_l2e(l2_pgentry_
     {
         unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
         int writeable = l2e_get_flags(l2e) & _PAGE_RW;
-        ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
+
+        ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
         do {
             put_data_page(mfn_to_page(m), writeable);
         } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
@@ -1031,14 +1032,28 @@ static int __put_page_type(struct page_i
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
                              int partial, int preemptible)
 {
-    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
-         (l3e_get_pfn(l3e) != pfn) )
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
+        return 1;
+
+#ifdef __x86_64__
+    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        if ( unlikely(partial > 0) )
-            return __put_page_type(l3e_get_page(l3e), preemptible);
-        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+        unsigned long mfn = l3e_get_pfn(l3e);
+        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
+        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+        do {
+            put_data_page(mfn_to_page(mfn), writeable);
+        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+
+        return 0;
     }
-    return 1;
+#endif
+
+    if ( unlikely(partial > 0) )
+        return __put_page_type(l3e_get_page(l3e), preemptible);
+
+    return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
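
put_page_from_l3e() now mirrors the existing put_page_from_l2e() superpage
handling: when the L3 entry is a 1GB mapping, it drops a data-page reference
on every constituent frame instead of dropping a page-table type reference.
A standalone arithmetic sketch of the loop bounds (illustration only, using
the usual x86-64/4K-page shift values):

    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define L2_PAGETABLE_SHIFT  21
    #define L3_PAGETABLE_SHIFT  30

    int main(void)
    {
        /* Frames covered by one superpage at each level. */
        printf("L2 (2MB) superpage: %lu frames\n",
               1UL << (L2_PAGETABLE_SHIFT - PAGE_SHIFT));   /* 512 */
        printf("L3 (1GB) superpage: %lu frames\n",
               1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT));   /* 262144 */
        return 0;
    }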
--- 2009-01-08.orig/xen/common/libelf/libelf-dominfo.c	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/xen/common/libelf/libelf-dominfo.c	2009-01-08 10:56:30.000000000 +0100
@@ -90,6 +90,7 @@ int elf_xen_parse_note(struct elf_binary
         [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0},
         [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0},
         [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0},
+        [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0},
         [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0},
         [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0},
         [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1},
@@ -164,6 +165,9 @@ int elf_xen_parse_note(struct elf_binary
     case XEN_ELFNOTE_ENTRY:
         parms->virt_entry = val;
         break;
+    case XEN_ELFNOTE_INIT_P2M:
+        parms->p2m_base = val;
+        break;
     case XEN_ELFNOTE_PADDR_OFFSET:
         parms->elf_paddr_offset = val;
         break;
@@ -392,6 +396,7 @@ static int elf_xen_addr_calc_check(struc
     elf_msg(elf, "    virt_kstart      = 0x%" PRIx64 "\n", parms->virt_kstart);
     elf_msg(elf, "    virt_kend        = 0x%" PRIx64 "\n", parms->virt_kend);
     elf_msg(elf, "    virt_entry       = 0x%" PRIx64 "\n", parms->virt_entry);
+    elf_msg(elf, "    p2m_base         = 0x%" PRIx64 "\n", parms->p2m_base);
 
     if ( (parms->virt_kstart > parms->virt_kend) ||
          (parms->virt_entry < parms->virt_kstart) ||
@@ -403,6 +408,15 @@ static int elf_xen_addr_calc_check(struc
         return -1;
     }
 
+    if ( (parms->p2m_base != UNSET_ADDR) &&
+         (parms->p2m_base >= parms->virt_kstart) &&
+         (parms->p2m_base < parms->virt_kend) )
+    {
+        elf_err(elf, "%s: ERROR: P->M table base falls within the kernel image.\n",
+                __FUNCTION__);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -422,6 +436,7 @@ int elf_xen_parse(struct elf_binary *elf
     parms->virt_entry = UNSET_ADDR;
     parms->virt_hypercall = UNSET_ADDR;
     parms->virt_hv_start_low = UNSET_ADDR;
+    parms->p2m_base = UNSET_ADDR;
     parms->elf_paddr_offset = UNSET_ADDR;
 
     /* Find and parse elf notes. */
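
elf_xen_addr_calc_check() rejects a requested P->M base that lands inside the
kernel image's virtual range [virt_kstart, virt_kend). A standalone sketch of
that containment test (illustration only, not code from this patch; the
sentinel is defined here to mirror libelf's unset-address value):

    #include <stdbool.h>
    #include <stdint.h>

    #define UNSET_ADDR ((uint64_t)-1)   /* "no INIT_P2M note seen" sentinel */

    static bool p2m_base_rejected(uint64_t p2m_base,
                                  uint64_t virt_kstart, uint64_t virt_kend)
    {
        return p2m_base != UNSET_ADDR &&
               p2m_base >= virt_kstart && p2m_base < virt_kend;
    }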
--- 2009-01-08.orig/xen/include/public/elfnote.h	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/xen/include/public/elfnote.h	2009-01-08 10:56:30.000000000 +0100
@@ -162,9 +162,20 @@
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
 /*
+ * The (non-default) location at which the initial phys-to-machine map
+ * should be placed by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, even though such pages are otherwise unavailable to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M      15
+
+/*
  * The number of the highest elfnote defined.
  */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
 
 /*
  * System information exported through crash notes.
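
For reference, a guest kernel advertises the desired P->M location by
embedding the new note in its ELF image. A hypothetical sketch of how a
GCC-built x86-64 kernel might do so (the section name ".note.Xen" and the
address 0xffffea0000000000 are illustrative assumptions, not taken from
this patch):

    /* Emit XEN_ELFNOTE_INIT_P2M (type 15) with a 64-bit payload giving the
     * virtual address at which the initial P->M table should be placed. */
    asm(".pushsection .note.Xen, \"a\", @note\n"
        ".p2align 2\n"
        ".long 2f - 1f\n"                  /* namesz */
        ".long 4f - 3f\n"                  /* descsz */
        ".long 15\n"                       /* type: XEN_ELFNOTE_INIT_P2M */
        "1: .asciz \"Xen\"\n"              /* name */
        "2: .p2align 2\n"
        "3: .quad 0xffffea0000000000\n"    /* desc: requested P->M base */
        "4: .p2align 2\n"
        ".popsection\n");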
--- 2009-01-08.orig/xen/include/public/libelf.h	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/xen/include/public/libelf.h	2009-01-08 10:56:30.000000000 +0100
@@ -232,6 +232,7 @@ struct elf_dom_parms {
     uint64_t virt_entry;
     uint64_t virt_hypercall;
     uint64_t virt_hv_start_low;
+    uint64_t p2m_base;
     uint64_t elf_paddr_offset;
     uint32_t f_supported[XENFEAT_NR_SUBMAPS];
     uint32_t f_required[XENFEAT_NR_SUBMAPS];
--- 2009-01-08.orig/xen/include/public/xen.h	2009-01-08 11:44:11.000000000 +0100
+++ 2009-01-08/xen/include/public/xen.h	2009-01-08 10:56:30.000000000 +0100
@@ -513,6 +513,7 @@ typedef struct shared_info shared_info_t
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
  *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
  *      e. bootstrap page tables         [pt_base, CR3 (x86)]
  *      f. bootstrap stack               [register ESP (x86)]
@@ -554,6 +555,9 @@ struct start_info {
     unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
     unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
     int8_t cmd_line[MAX_GUEST_CMDLINE];
+    /* The pfn range here covers both page table and p->m table frames.   */
+    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
 };
 typedef struct start_info start_info_t;
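
The two new start_info fields let a 64-bit guest discover which of its pfns
Xen consumed for the relocated P->M table (both the table's data frames and
the page-table frames mapping it). A hypothetical helper sketch (illustration
only; the function name is made up and "si" is assumed to point at the
guest's start_info):

    /* Does pfn belong to the initial P->M table range handed over by Xen? */
    static int pfn_in_initial_p2m(const start_info_t *si, unsigned long pfn)
    {
        return pfn >= si->first_p2m_pfn &&
               pfn <  si->first_p2m_pfn + si->nr_p2m_frames;
    }

A guest that wants to repurpose these frames would presumably have to stop
relying on the Xen-provided mapping at the XEN_ELFNOTE_INIT_P2M address first.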