# HG changeset patch
# Parent 427c10f8e1e28d942886f89ebc79ffa93cb7fce9
xenpaging: use wait queues

Use a wait queue to put a guest vcpu to sleep while the requested gfn is
in paging state. This adds missing p2m_mem_paging_populate() calls to some
callers of the new get_gfn* variants, which would otherwise crash because
they receive an invalid mfn. It also fixes guest crashes caused by
unexpected returns from do_memory_op when copy_to/from_guest ran into a
paged gfn. Now those places will always get a valid mfn.

Since each gfn could be requested by several guest vcpus at the same time,
a queue of paged gfns is maintained. Each waiting vcpu is attached to that
queue. Once p2m_mem_paging_resume() has restored the gfn, the waiting
vcpus resume execution.

There is untested code in p2m_mem_paging_init_queue() to allow cpu
hotplug. Since each vcpu may wait on a different gfn, there have to be as
many queues as vcpus. But xl vcpu-set does not seem to work right now, so
this code path can't be exercised yet.

TODO:
 - use a hash in p2m_mem_paging_queue_head
 - rename gfn_lock
 - use mm_lock_t for gfn_lock

Signed-off-by: Olaf Hering

---
 xen/arch/x86/hvm/hvm.c           |    2 
 xen/arch/x86/mm/p2m.c            |  220 +++++++++++++++++++++++++++++++++------
 xen/common/domctl.c              |    3 
 xen/include/asm-x86/hvm/domain.h |    3 
 xen/include/asm-x86/p2m.h        |    7 +
 5 files changed, 205 insertions(+), 30 deletions(-)

Index: xen-4.1.3-testing/xen/arch/x86/hvm/hvm.c
===================================================================
--- xen-4.1.3-testing.orig/xen/arch/x86/hvm/hvm.c
+++ xen-4.1.3-testing/xen/arch/x86/hvm/hvm.c
@@ -475,6 +475,8 @@ int hvm_domain_initialise(struct domain
     spin_lock_init(&d->arch.hvm_domain.irq_lock);
     spin_lock_init(&d->arch.hvm_domain.uc_lock);
 
+    spin_lock_init(&d->arch.hvm_domain.gfn_lock);
+
     INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
     spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
 
Index: xen-4.1.3-testing/xen/arch/x86/mm/p2m.c
===================================================================
--- xen-4.1.3-testing.orig/xen/arch/x86/mm/p2m.c
+++ xen-4.1.3-testing/xen/arch/x86/mm/p2m.c
@@ -30,6 +30,7 @@
 #include
 #include /* ept_p2m_init() */
 #include
+#include
 #include
 #include
 #include
@@ -2841,6 +2842,182 @@ set_shared_p2m_entry(struct p2m_domain *
 }
 
 #ifdef __x86_64__
+struct p2m_mem_paging_queue {
+    struct list_head list;
+    struct waitqueue_head wq;
+    unsigned long gfn;
+    unsigned short waiters;
+    unsigned short woken;
+    unsigned short index;
+};
+
+struct p2m_mem_paging_queue_head {
+    struct list_head list;
+    unsigned int max;
+};
+
+int p2m_mem_paging_init_queue(struct domain *d, unsigned int max)
+{
+    struct p2m_mem_paging_queue_head *h;
+    struct p2m_mem_paging_queue *q;
+    unsigned int i, nr;
+    int ret = 0;
+
+    if (!is_hvm_domain(d))
+        return 0;
+
+    spin_lock(&d->arch.hvm_domain.gfn_lock);
+
+    if (!d->arch.hvm_domain.gfn_queue) {
+        ret = -ENOMEM;
+        h = xzalloc(struct p2m_mem_paging_queue_head);
+        if (!h) {
+            domain_crash(d);
+            goto out;
+        }
+
+        INIT_LIST_HEAD(&h->list);
+        nr = max;
+    } else {
+        h = d->arch.hvm_domain.gfn_queue;
+        if (max <= h->max)
+            goto out;
+        nr = max - h->max;
+    }
+
+    ret = -ENOMEM;
+    q = xzalloc_array(struct p2m_mem_paging_queue, nr);
+    if (!q) {
+        if (!d->arch.hvm_domain.gfn_queue)
+            xfree(h);
+        domain_crash(d);
+        goto out;
+    }
+
+    for (i = 0; i < nr; i++) {
+        init_waitqueue_head(&q[i].wq);
+        INIT_LIST_HEAD(&q[i].list);
+        q[i].index = h->max + i + 1;
+        list_add_tail(&q[i].list, &h->list);
+    }
+
+    h->max = max;
+    d->arch.hvm_domain.gfn_queue = h;
+    ret = 0;
+
+out:
+    spin_unlock(&d->arch.hvm_domain.gfn_lock);
+    return ret;
+}
+
+static struct p2m_mem_paging_queue *p2m_mem_paging_get_queue(struct domain *d, unsigned long gfn)
+{
+    struct p2m_mem_paging_queue_head *h;
+    struct p2m_mem_paging_queue *q, *q_match, *q_free;
+
+    h = d->arch.hvm_domain.gfn_queue;
+    q_match = q_free = NULL;
+
+    spin_lock(&d->arch.hvm_domain.gfn_lock);
+
+    list_for_each_entry(q, &h->list, list) {
+        if (q->gfn == gfn) {
+            q_match = q;
+            break;
+        }
+        if (!q_free && !q->waiters)
+            q_free = q;
+    }
+
+    if (!q_match && q_free)
+        q_match = q_free;
+
+    if (q_match) {
+        if (q_match->woken)
+            printk("wq woken for gfn %u:%u %lx %u %u %u\n", current->domain->domain_id, current->vcpu_id, gfn, q_match->index, q_match->woken, q_match->waiters);
+        q_match->waiters++;
+        q_match->gfn = gfn;
+    }
+
+    if (!q_match)
+        printk("No wq_get for gfn %u:%u %lx\n", current->domain->domain_id, current->vcpu_id, gfn);
+
+    spin_unlock(&d->arch.hvm_domain.gfn_lock);
+    return q_match;
+}
+
+static void p2m_mem_paging_put_queue(struct domain *d, struct p2m_mem_paging_queue *q_match)
+{
+    spin_lock(&d->arch.hvm_domain.gfn_lock);
+
+    if (q_match->waiters == 0)
+        printk("wq_put no waiters, gfn %u:%u %lx %u\n", current->domain->domain_id, current->vcpu_id, q_match->gfn, q_match->woken);
+    else if (--q_match->waiters == 0)
+        q_match->gfn = q_match->woken = 0;
+
+    spin_unlock(&d->arch.hvm_domain.gfn_lock);
+}
+
+static void p2m_mem_paging_wake_queue(struct domain *d, unsigned long gfn)
+{
+    struct p2m_mem_paging_queue_head *h;
+    struct p2m_mem_paging_queue *q, *q_match = NULL;
+
+    spin_lock(&d->arch.hvm_domain.gfn_lock);
+
+    h = d->arch.hvm_domain.gfn_queue;
+    list_for_each_entry(q, &h->list, list) {
+        if (q->gfn == gfn) {
+            q_match = q;
+            break;
+        }
+    }
+    if (q_match) {
+        if (q_match->woken || q_match->waiters == 0)
+            printk("Wrong wake for gfn %u:%u %p %lx %u %u\n", current->domain->domain_id, current->vcpu_id, q_match, gfn, q_match->woken, q_match->waiters);
+        q_match->woken++;
+        wake_up_all(&q_match->wq);
+    }
+    spin_unlock(&d->arch.hvm_domain.gfn_lock);
+}
+
+/* Returns 0 if the gfn is still paged */
+static int p2m_mem_paging_get_entry(mfn_t *mfn,
+                struct p2m_domain *p2m, unsigned long gfn,
+                p2m_type_t *t, p2m_query_t q)
+{
+    p2m_access_t a = 0;
+    *mfn = p2m->get_entry(p2m, gfn, t, &a, q);
+
+    return p2m_is_paging(*t) ? 0 : 1;
+}
+
+/* Go to sleep in case of guest access */
+void p2m_mem_paging_wait(mfn_t *mfn,
+                struct p2m_domain *p2m, unsigned long gfn,
+                p2m_type_t *t, p2m_query_t q)
+{
+    struct p2m_mem_paging_queue *pmpq;
+
+    /* Return p2mt as is in case of query */
+    if ( q == p2m_query )
+        return;
+    /* Foreign domains can not go to sleep */
+    if ( current->domain != p2m->domain )
+        return;
+
+    pmpq = p2m_mem_paging_get_queue(p2m->domain, gfn);
+    if ( !pmpq )
+        return;
+
+    /* Populate the page once */
+    if ( *t == p2m_ram_paging_out || *t == p2m_ram_paged )
+        p2m_mem_paging_populate(p2m, gfn);
+
+    wait_event(pmpq->wq, p2m_mem_paging_get_entry(mfn, p2m, gfn, t, q));
+    p2m_mem_paging_put_queue(p2m->domain, pmpq);
+}
+
 /**
  * p2m_mem_paging_nominate - Mark a guest page as to-be-paged-out
  * @d: guest domain
@@ -3022,21 +3199,17 @@ void p2m_mem_paging_drop_page(struct p2m
  */
 void p2m_mem_paging_populate(struct p2m_domain *p2m, unsigned long gfn)
 {
-    struct vcpu *v = current;
-    mem_event_request_t req;
+    mem_event_request_t req = { .type = MEM_EVENT_TYPE_PAGING, .gfn = gfn };
     p2m_type_t p2mt;
     p2m_access_t a;
     mfn_t mfn;
-    int restored = 0;
     struct domain *d = p2m->domain;
+    int put_request = 0;
 
     /* Check that there's space on the ring for this request */
     if ( mem_event_claim_slot(d, &d->mem_event->paging) )
         return;
 
-    memset(&req, 0, sizeof(req));
-    req.type = MEM_EVENT_TYPE_PAGING;
-
     /* Fix p2m mapping */
     p2m_lock(p2m);
     mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query);
@@ -3045,35 +3218,23 @@ void p2m_mem_paging_populate(struct p2m_
     {
         /* Restore page state if gfn was requested before evict */
         if ( p2mt == p2m_ram_paging_out && mfn_valid(mfn) ) {
+            /* Restore gfn because it is needed by guest before evict */
             set_p2m_entry(p2m, gfn, mfn, 0, p2m_ram_rw, a);
-            restored = 1;
         } else {
             set_p2m_entry(p2m, gfn, mfn, 0, p2m_ram_paging_in_start, a);
+            put_request = 1;
         }
+        /* Evict will fail now, the pager has to try another gfn */
+        audit_p2m(p2m, 1);
     }
     p2m_unlock(p2m);
 
-    /* Pause domain if request came from guest and gfn has paging type */
-    if ( !restored && p2m_is_paging(p2mt) && v->domain == d )
-    {
-        vcpu_pause_nosync(v);
-        req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
-    }
-    /* No need to inform pager if the gfn is not in the page-out path */
-    else if ( restored || !p2m_do_populate(p2mt) )
-    {
-        /* gfn is already on its way back and vcpu is not paused */
+    /* One request per gfn, guest vcpus go to sleep, foreigners try again */
+    if ( put_request )
+        mem_event_put_request(d, &d->mem_event->paging, &req);
+    else
         mem_event_release_slot(d, &d->mem_event->paging);
-        return;
-    }
-
-    /* Send request to pager */
-    req.gfn = gfn;
-    req.p2mt = p2mt;
-    req.vcpu_id = v->vcpu_id;
-
-    mem_event_put_request(d, &d->mem_event->paging, &req);
 }
 
 /**
@@ -3200,12 +3361,11 @@ void p2m_mem_paging_resume(struct p2m_do
         p2m_unlock(p2m);
     }
 
-    /* Unpause domain */
-    if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
-        vcpu_unpause(d->vcpu[rsp.vcpu_id]);
-
     /* Wake vcpus waiting for room in the ring */
     mem_event_wake_requesters(&d->mem_event->paging);
+
+    /* Unpause all vcpus that were paused because the gfn was paged */
+    p2m_mem_paging_wake_queue(d, rsp.gfn);
 }
 
 void p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
Index: xen-4.1.3-testing/xen/common/domctl.c
===================================================================
--- xen-4.1.3-testing.orig/xen/common/domctl.c
+++ xen-4.1.3-testing/xen/common/domctl.c
@@ -536,6 +536,9 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
                 goto maxvcpu_out;
         }
 
+        if ( p2m_mem_paging_init_queue(d, max) )
+            goto maxvcpu_out;
+
         ret = 0;
 
     maxvcpu_out:
Index: xen-4.1.3-testing/xen/include/asm-x86/hvm/domain.h
===================================================================
--- xen-4.1.3-testing.orig/xen/include/asm-x86/hvm/domain.h
+++ xen-4.1.3-testing/xen/include/asm-x86/hvm/domain.h
@@ -87,6 +87,9 @@ struct hvm_domain {
 
     struct viridian_domain viridian;
 
+    spinlock_t             gfn_lock;
+    struct p2m_mem_paging_queue_head *gfn_queue;
+
     bool_t                 hap_enabled;
     bool_t                 mem_sharing_enabled;
     bool_t                 qemu_mapcache_invalidate;
Index: xen-4.1.3-testing/xen/include/asm-x86/p2m.h
===================================================================
--- xen-4.1.3-testing.orig/xen/include/asm-x86/p2m.h
+++ xen-4.1.3-testing/xen/include/asm-x86/p2m.h
@@ -345,6 +345,8 @@ gfn_to_mfn_type_p2m(struct p2m_domain *p
 }
 
+extern void p2m_mem_paging_wait(mfn_t *mfn, struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_query_t q);
+
 /* General conversion function from gfn to mfn */
 static inline mfn_t _gfn_to_mfn_type(struct p2m_domain *p2m,
                                      unsigned long gfn, p2m_type_t *t,
@@ -366,6 +368,9 @@ static inline mfn_t _gfn_to_mfn_type(str
         mfn = gfn_to_mfn_type_p2m(p2m, gfn, t, q);
 
 #ifdef __x86_64__
+    if (unlikely(p2m_is_paging(*t)) )
+        p2m_mem_paging_wait(&mfn, p2m, gfn, t, q);
+
     if (unlikely((p2m_is_broken(*t))))
     {
         /* Return invalid_mfn to avoid caller's access */
@@ -522,6 +527,8 @@ int clear_mmio_p2m_entry(struct p2m_doma
 /* Modify p2m table for shared gfn */
 int set_shared_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
 
+/* Initialize per-gfn wait queue */
+int p2m_mem_paging_init_queue(struct domain *d, unsigned int max);
 /* Check if a nominated gfn is valid to be paged out */
 int p2m_mem_paging_nominate(struct p2m_domain *p2m, unsigned long gfn);
 /* Evict a frame */
@@ -535,6 +542,8 @@ int p2m_mem_paging_prep(struct p2m_domai
 /* Resume normal operation (in case a domain was paused) */
 void p2m_mem_paging_resume(struct p2m_domain *p2m);
 #else
+static inline int p2m_mem_paging_init_queue(struct domain *d, unsigned int max)
+{ return 0; }
 static inline void p2m_mem_paging_drop_page(struct p2m_domain *p2m, unsigned long gfn)
 { }
 static inline void p2m_mem_paging_populate(struct p2m_domain *p2m, unsigned long gfn)