Accepting request 346076 from Virtualization
fate#315712: XEN: Use the PVOPS kernel (disable KMP building)

OBS-URL: https://build.opensuse.org/request/show/346076
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/xen?expand=0&rev=213
Commit: 32bf53107e
@ -1,49 +0,0 @@
# Commit b7f74a19fe099e373ad52e4218c466f3e91b5f43
# Date 2015-01-23 15:05:48 +0100
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
grant-table: use uint16_t consistently for grant copy offset and length

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -1882,7 +1882,7 @@ static int
__acquire_grant_for_copy(
struct domain *rd, unsigned long gref, domid_t ldom, int readonly,
unsigned long *frame, struct page_info **page,
- unsigned *page_off, unsigned *length, unsigned allow_transitive)
+ uint16_t *page_off, uint16_t *length, unsigned allow_transitive)
{
struct grant_table *rgt = rd->grant_table;
grant_entry_v1_t *sha1;
@@ -1895,8 +1895,8 @@ __acquire_grant_for_copy(
grant_ref_t trans_gref;
struct domain *td;
unsigned long grant_frame;
- unsigned trans_page_off;
- unsigned trans_length;
+ uint16_t trans_page_off;
+ uint16_t trans_length;
int is_sub_page;
s16 rc = GNTST_okay;

@@ -2122,7 +2122,7 @@ __gnttab_copy(

if ( src_is_gref )
{
- unsigned source_off, source_len;
+ uint16_t source_off, source_len;
rc = __acquire_grant_for_copy(sd, op->source.u.ref,
current->domain->domain_id, 1,
&s_frame, &s_pg,
@@ -2147,7 +2147,7 @@ __gnttab_copy(

if ( dest_is_gref )
{
- unsigned dest_off, dest_len;
+ uint16_t dest_off, dest_len;
rc = __acquire_grant_for_copy(dd, op->dest.u.ref,
current->domain->domain_id, 0,
&d_frame, &d_pg, &dest_off, &dest_len, 1);
@ -1,373 +0,0 @@
|
||||
# Commit 3c72f8c2cf19f735d813081c836f03e3078ee5c1
|
||||
# Date 2015-01-29 14:21:00 +0100
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
grant-table: refactor grant copy to reduce duplicate code
|
||||
|
||||
Much of the grant copy operation is identical for the source and
|
||||
destination buffers. Refactor the code into per-buffer functions.
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -2077,139 +2077,230 @@ __acquire_grant_for_copy(
|
||||
return rc;
|
||||
}
|
||||
|
||||
-static void
|
||||
-__gnttab_copy(
|
||||
- struct gnttab_copy *op)
|
||||
-{
|
||||
- struct domain *sd = NULL, *dd = NULL;
|
||||
- unsigned long s_frame, d_frame;
|
||||
- struct page_info *s_pg = NULL, *d_pg = NULL;
|
||||
- char *sp, *dp;
|
||||
- s16 rc = GNTST_okay;
|
||||
- int have_d_grant = 0, have_s_grant = 0;
|
||||
- int src_is_gref, dest_is_gref;
|
||||
-
|
||||
- if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
|
||||
- ((op->dest.offset + op->len) > PAGE_SIZE) )
|
||||
- PIN_FAIL(error_out, GNTST_bad_copy_arg, "copy beyond page area.\n");
|
||||
+struct gnttab_copy_buf {
|
||||
+ /* Guest provided. */
|
||||
+ struct gnttab_copy_ptr ptr;
|
||||
+ uint16_t len;
|
||||
+
|
||||
+ /* Mapped etc. */
|
||||
+ struct domain *domain;
|
||||
+ unsigned long frame;
|
||||
+ struct page_info *page;
|
||||
+ void *virt;
|
||||
+ bool_t read_only;
|
||||
+ bool_t have_grant;
|
||||
+ bool_t have_type;
|
||||
+};
|
||||
|
||||
- src_is_gref = op->flags & GNTCOPY_source_gref;
|
||||
- dest_is_gref = op->flags & GNTCOPY_dest_gref;
|
||||
+static int gnttab_copy_lock_domain(domid_t domid, unsigned int gref_flag,
|
||||
+ struct gnttab_copy_buf *buf)
|
||||
+{
|
||||
+ int rc;
|
||||
|
||||
- if ( (op->source.domid != DOMID_SELF && !src_is_gref ) ||
|
||||
- (op->dest.domid != DOMID_SELF && !dest_is_gref) )
|
||||
- PIN_FAIL(error_out, GNTST_permission_denied,
|
||||
+ if ( domid != DOMID_SELF && !gref_flag )
|
||||
+ PIN_FAIL(out, GNTST_permission_denied,
|
||||
"only allow copy-by-mfn for DOMID_SELF.\n");
|
||||
|
||||
- if ( op->source.domid == DOMID_SELF )
|
||||
- sd = rcu_lock_current_domain();
|
||||
- else if ( (sd = rcu_lock_domain_by_id(op->source.domid)) == NULL )
|
||||
- PIN_FAIL(error_out, GNTST_bad_domain,
|
||||
- "couldn't find %d\n", op->source.domid);
|
||||
-
|
||||
- if ( op->dest.domid == DOMID_SELF )
|
||||
- dd = rcu_lock_current_domain();
|
||||
- else if ( (dd = rcu_lock_domain_by_id(op->dest.domid)) == NULL )
|
||||
- PIN_FAIL(error_out, GNTST_bad_domain,
|
||||
- "couldn't find %d\n", op->dest.domid);
|
||||
+ if ( domid == DOMID_SELF )
|
||||
+ buf->domain = rcu_lock_current_domain();
|
||||
+ else
|
||||
+ {
|
||||
+ buf->domain = rcu_lock_domain_by_id(domid);
|
||||
+ if ( buf->domain == NULL )
|
||||
+ PIN_FAIL(out, GNTST_bad_domain, "couldn't find %d\n", domid);
|
||||
+ }
|
||||
|
||||
- rc = xsm_grant_copy(XSM_HOOK, sd, dd);
|
||||
- if ( rc )
|
||||
+ buf->ptr.domid = domid;
|
||||
+ rc = GNTST_okay;
|
||||
+ out:
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
+static void gnttab_copy_unlock_domains(struct gnttab_copy_buf *src,
|
||||
+ struct gnttab_copy_buf *dest)
|
||||
+{
|
||||
+ if ( src->domain )
|
||||
+ {
|
||||
+ rcu_unlock_domain(src->domain);
|
||||
+ src->domain = NULL;
|
||||
+ }
|
||||
+ if ( dest->domain )
|
||||
+ {
|
||||
+ rcu_unlock_domain(dest->domain);
|
||||
+ dest->domain = NULL;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int gnttab_copy_lock_domains(const struct gnttab_copy *op,
|
||||
+ struct gnttab_copy_buf *src,
|
||||
+ struct gnttab_copy_buf *dest)
|
||||
+{
|
||||
+ int rc;
|
||||
+
|
||||
+ rc = gnttab_copy_lock_domain(op->source.domid,
|
||||
+ op->flags & GNTCOPY_source_gref, src);
|
||||
+ if ( rc < 0 )
|
||||
+ goto error;
|
||||
+ rc = gnttab_copy_lock_domain(op->dest.domid,
|
||||
+ op->flags & GNTCOPY_dest_gref, dest);
|
||||
+ if ( rc < 0 )
|
||||
+ goto error;
|
||||
+
|
||||
+ rc = xsm_grant_copy(XSM_HOOK, src->domain, dest->domain);
|
||||
+ if ( rc < 0 )
|
||||
{
|
||||
rc = GNTST_permission_denied;
|
||||
- goto error_out;
|
||||
+ goto error;
|
||||
}
|
||||
+ return 0;
|
||||
+
|
||||
+ error:
|
||||
+ gnttab_copy_unlock_domains(src, dest);
|
||||
+ return rc;
|
||||
+}
|
||||
|
||||
- if ( src_is_gref )
|
||||
+static void gnttab_copy_release_buf(struct gnttab_copy_buf *buf)
|
||||
+{
|
||||
+ if ( buf->virt )
|
||||
{
|
||||
- uint16_t source_off, source_len;
|
||||
- rc = __acquire_grant_for_copy(sd, op->source.u.ref,
|
||||
- current->domain->domain_id, 1,
|
||||
- &s_frame, &s_pg,
|
||||
- &source_off, &source_len, 1);
|
||||
- if ( rc != GNTST_okay )
|
||||
- goto error_out;
|
||||
- have_s_grant = 1;
|
||||
- if ( op->source.offset < source_off ||
|
||||
- op->len > source_len )
|
||||
- PIN_FAIL(error_out, GNTST_general_error,
|
||||
- "copy source out of bounds: %d < %d || %d > %d\n",
|
||||
- op->source.offset, source_off,
|
||||
- op->len, source_len);
|
||||
+ unmap_domain_page(buf->virt);
|
||||
+ buf->virt = NULL;
|
||||
}
|
||||
- else
|
||||
+ if ( buf->have_type )
|
||||
{
|
||||
- rc = __get_paged_frame(op->source.u.gmfn, &s_frame, &s_pg, 1, sd);
|
||||
- if ( rc != GNTST_okay )
|
||||
- PIN_FAIL(error_out, rc,
|
||||
- "source frame %lx invalid.\n", s_frame);
|
||||
+ put_page_type(buf->page);
|
||||
+ buf->have_type = 0;
|
||||
+ }
|
||||
+ if ( buf->page )
|
||||
+ {
|
||||
+ put_page(buf->page);
|
||||
+ buf->page = NULL;
|
||||
+ }
|
||||
+ if ( buf->have_grant )
|
||||
+ {
|
||||
+ __release_grant_for_copy(buf->domain, buf->ptr.u.ref, buf->read_only);
|
||||
+ buf->have_grant = 0;
|
||||
}
|
||||
+}
|
||||
+
|
||||
+static int gnttab_copy_claim_buf(const struct gnttab_copy *op,
|
||||
+ const struct gnttab_copy_ptr *ptr,
|
||||
+ struct gnttab_copy_buf *buf,
|
||||
+ unsigned int gref_flag)
|
||||
+{
|
||||
+ int rc;
|
||||
+
|
||||
+ buf->read_only = gref_flag == GNTCOPY_source_gref;
|
||||
|
||||
- if ( dest_is_gref )
|
||||
+ if ( op->flags & gref_flag )
|
||||
{
|
||||
- uint16_t dest_off, dest_len;
|
||||
- rc = __acquire_grant_for_copy(dd, op->dest.u.ref,
|
||||
- current->domain->domain_id, 0,
|
||||
- &d_frame, &d_pg, &dest_off, &dest_len, 1);
|
||||
+ rc = __acquire_grant_for_copy(buf->domain, ptr->u.ref,
|
||||
+ current->domain->domain_id,
|
||||
+ buf->read_only,
|
||||
+ &buf->frame, &buf->page,
|
||||
+ &buf->ptr.offset, &buf->len, 1);
|
||||
if ( rc != GNTST_okay )
|
||||
- goto error_out;
|
||||
- have_d_grant = 1;
|
||||
- if ( op->dest.offset < dest_off ||
|
||||
- op->len > dest_len )
|
||||
- PIN_FAIL(error_out, GNTST_general_error,
|
||||
- "copy dest out of bounds: %d < %d || %d > %d\n",
|
||||
- op->dest.offset, dest_off,
|
||||
- op->len, dest_len);
|
||||
+ goto out;
|
||||
+ buf->ptr.u.ref = ptr->u.ref;
|
||||
+ buf->have_grant = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
- rc = __get_paged_frame(op->dest.u.gmfn, &d_frame, &d_pg, 0, dd);
|
||||
+ rc = __get_paged_frame(ptr->u.gmfn, &buf->frame, &buf->page,
|
||||
+ buf->read_only, buf->domain);
|
||||
if ( rc != GNTST_okay )
|
||||
- PIN_FAIL(error_out, rc,
|
||||
- "destination frame %lx invalid.\n", d_frame);
|
||||
+ PIN_FAIL(out, rc,
|
||||
+ "source frame %lx invalid.\n", ptr->u.gmfn);
|
||||
+
|
||||
+ buf->ptr.u.gmfn = ptr->u.gmfn;
|
||||
+ buf->ptr.offset = 0;
|
||||
+ buf->len = PAGE_SIZE;
|
||||
}
|
||||
|
||||
- if ( !get_page_type(d_pg, PGT_writable_page) )
|
||||
+ if ( !buf->read_only )
|
||||
{
|
||||
- if ( !dd->is_dying )
|
||||
- gdprintk(XENLOG_WARNING, "Could not get dst frame %lx\n", d_frame);
|
||||
- rc = GNTST_general_error;
|
||||
- goto error_out;
|
||||
- }
|
||||
-
|
||||
- sp = map_domain_page(s_frame);
|
||||
- dp = map_domain_page(d_frame);
|
||||
-
|
||||
- memcpy(dp + op->dest.offset, sp + op->source.offset, op->len);
|
||||
-
|
||||
- unmap_domain_page(dp);
|
||||
- unmap_domain_page(sp);
|
||||
-
|
||||
- gnttab_mark_dirty(dd, d_frame);
|
||||
-
|
||||
- put_page_type(d_pg);
|
||||
- error_out:
|
||||
- if ( d_pg )
|
||||
- put_page(d_pg);
|
||||
- if ( s_pg )
|
||||
- put_page(s_pg);
|
||||
- if ( have_s_grant )
|
||||
- __release_grant_for_copy(sd, op->source.u.ref, 1);
|
||||
- if ( have_d_grant )
|
||||
- __release_grant_for_copy(dd, op->dest.u.ref, 0);
|
||||
- if ( sd )
|
||||
- rcu_unlock_domain(sd);
|
||||
- if ( dd )
|
||||
- rcu_unlock_domain(dd);
|
||||
- op->status = rc;
|
||||
+ if ( !get_page_type(buf->page, PGT_writable_page) )
|
||||
+ {
|
||||
+ if ( !buf->domain->is_dying )
|
||||
+ gdprintk(XENLOG_WARNING, "Could not get writable frame %lx\n", buf->frame);
|
||||
+ rc = GNTST_general_error;
|
||||
+ goto out;
|
||||
+ }
|
||||
+ buf->have_type = 1;
|
||||
+ }
|
||||
+
|
||||
+ buf->virt = map_domain_page(buf->frame);
|
||||
+ rc = GNTST_okay;
|
||||
+
|
||||
+ out:
|
||||
+ return rc;
|
||||
}
|
||||
|
||||
-static long
|
||||
-gnttab_copy(
|
||||
+static int gnttab_copy_buf(const struct gnttab_copy *op,
|
||||
+ struct gnttab_copy_buf *dest,
|
||||
+ const struct gnttab_copy_buf *src)
|
||||
+{
|
||||
+ int rc;
|
||||
+
|
||||
+ if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
|
||||
+ ((op->dest.offset + op->len) > PAGE_SIZE) )
|
||||
+ PIN_FAIL(out, GNTST_bad_copy_arg, "copy beyond page area.\n");
|
||||
+
|
||||
+ if ( op->source.offset < src->ptr.offset ||
|
||||
+ op->source.offset + op->len > src->ptr.offset + src->len )
|
||||
+ PIN_FAIL(out, GNTST_general_error,
|
||||
+ "copy source out of bounds: %d < %d || %d > %d\n",
|
||||
+ op->source.offset, src->ptr.offset,
|
||||
+ op->len, src->len);
|
||||
+
|
||||
+ if ( op->dest.offset < dest->ptr.offset ||
|
||||
+ op->dest.offset + op->len > dest->ptr.offset + dest->len )
|
||||
+ PIN_FAIL(out, GNTST_general_error,
|
||||
+ "copy dest out of bounds: %d < %d || %d > %d\n",
|
||||
+ op->dest.offset, dest->ptr.offset,
|
||||
+ op->len, dest->len);
|
||||
+
|
||||
+ memcpy(dest->virt + op->dest.offset, src->virt + op->source.offset,
|
||||
+ op->len);
|
||||
+ gnttab_mark_dirty(dest->domain, dest->frame);
|
||||
+ rc = GNTST_okay;
|
||||
+ out:
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
+static int gnttab_copy_one(const struct gnttab_copy *op,
|
||||
+ struct gnttab_copy_buf *dest,
|
||||
+ struct gnttab_copy_buf *src)
|
||||
+{
|
||||
+ int rc;
|
||||
+
|
||||
+ rc = gnttab_copy_lock_domains(op, src, dest);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+
|
||||
+ rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+
|
||||
+ rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+
|
||||
+ rc = gnttab_copy_buf(op, dest, src);
|
||||
+ out:
|
||||
+ gnttab_copy_release_buf(src);
|
||||
+ gnttab_copy_release_buf(dest);
|
||||
+ gnttab_copy_unlock_domains(src, dest);
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
+static long gnttab_copy(
|
||||
XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
struct gnttab_copy op;
|
||||
+ struct gnttab_copy_buf src = {};
|
||||
+ struct gnttab_copy_buf dest = {};
|
||||
|
||||
for ( i = 0; i < count; i++ )
|
||||
{
|
||||
@@ -2217,7 +2308,9 @@ gnttab_copy(
|
||||
return i;
|
||||
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
|
||||
return -EFAULT;
|
||||
- __gnttab_copy(&op);
|
||||
+
|
||||
+ op.status = gnttab_copy_one(&op, &dest, &src);
|
||||
+
|
||||
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
|
||||
return -EFAULT;
|
||||
guest_handle_add_offset(uop, 1);
|
||||
--- a/xen/include/public/grant_table.h
|
||||
+++ b/xen/include/public/grant_table.h
|
||||
@@ -453,7 +453,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_
|
||||
|
||||
struct gnttab_copy {
|
||||
/* IN parameters. */
|
||||
- struct {
|
||||
+ struct gnttab_copy_ptr {
|
||||
union {
|
||||
grant_ref_t ref;
|
||||
xen_pfn_t gmfn;
|
@ -1,155 +0,0 @@
|
||||
# Commit d28f42f2703e483116bafd2b0b76a32af67d83ad
|
||||
# Date 2015-01-29 14:22:22 +0100
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
grant-table: defer releasing pages acquired in a grant copy
|
||||
|
||||
Acquiring a page for the source or destination of a grant copy is an
|
||||
expensive operation. A common use case is for two adjacent grant copy
|
||||
ops to operate on either the same source or the same destination page.
|
||||
|
||||
Instead of always acquiring and releasing destination and source pages
|
||||
for each operation, release the page once it is no longer valid for
|
||||
the next op.
|
||||
|
||||
If either the source or destination domains changes both pages are
|
||||
released as it is unlikely that either will still be valid.
|
||||
|
||||
XenServer's performance benchmarks show modest improvements in network
|
||||
receive throughput (netback uses grant copy in the guest Rx path) and
|
||||
no regressions in disk performance (using tapdisk3 which grant copies
|
||||
as the backend).
|
||||
|
||||
Baseline Deferred Release
|
||||
Interhost receive to VM 7.2 Gb/s ~9 Gbit/s
|
||||
Interhost aggregate 24 Gb/s 28 Gb/s
|
||||
Intrahost single stream 14 Gb/s 14 Gb/s
|
||||
Intrahost aggregate 34 Gb/s 36 Gb/s
|
||||
Aggregate disk write 900 MB/s 900 MB/s
|
||||
Aggregate disk read 890 MB/s 890 MB/s
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -2236,6 +2236,17 @@ static int gnttab_copy_claim_buf(const s
|
||||
return rc;
|
||||
}
|
||||
|
||||
+static bool_t gnttab_copy_buf_valid(const struct gnttab_copy_ptr *p,
|
||||
+ const struct gnttab_copy_buf *b,
|
||||
+ bool_t has_gref)
|
||||
+{
|
||||
+ if ( !b->virt )
|
||||
+ return 0;
|
||||
+ if ( has_gref )
|
||||
+ return b->have_grant && p->u.ref == b->ptr.u.ref;
|
||||
+ return p->u.gmfn == b->ptr.u.gmfn;
|
||||
+}
|
||||
+
|
||||
static int gnttab_copy_buf(const struct gnttab_copy *op,
|
||||
struct gnttab_copy_buf *dest,
|
||||
const struct gnttab_copy_buf *src)
|
||||
@@ -2274,23 +2285,40 @@ static int gnttab_copy_one(const struct
|
||||
{
|
||||
int rc;
|
||||
|
||||
- rc = gnttab_copy_lock_domains(op, src, dest);
|
||||
- if ( rc < 0 )
|
||||
- goto out;
|
||||
+ if ( !src->domain || op->source.domid != src->ptr.domid ||
|
||||
+ !dest->domain || op->dest.domid != dest->ptr.domid )
|
||||
+ {
|
||||
+ gnttab_copy_release_buf(src);
|
||||
+ gnttab_copy_release_buf(dest);
|
||||
+ gnttab_copy_unlock_domains(src, dest);
|
||||
|
||||
- rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
|
||||
- if ( rc < 0 )
|
||||
- goto out;
|
||||
+ rc = gnttab_copy_lock_domains(op, src, dest);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+ }
|
||||
|
||||
- rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
|
||||
- if ( rc < 0 )
|
||||
- goto out;
|
||||
+ /* Different source? */
|
||||
+ if ( !gnttab_copy_buf_valid(&op->source, src,
|
||||
+ op->flags & GNTCOPY_source_gref) )
|
||||
+ {
|
||||
+ gnttab_copy_release_buf(src);
|
||||
+ rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ /* Different dest? */
|
||||
+ if ( !gnttab_copy_buf_valid(&op->dest, dest,
|
||||
+ op->flags & GNTCOPY_dest_gref) )
|
||||
+ {
|
||||
+ gnttab_copy_release_buf(dest);
|
||||
+ rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
|
||||
+ if ( rc < 0 )
|
||||
+ goto out;
|
||||
+ }
|
||||
|
||||
rc = gnttab_copy_buf(op, dest, src);
|
||||
out:
|
||||
- gnttab_copy_release_buf(src);
|
||||
- gnttab_copy_release_buf(dest);
|
||||
- gnttab_copy_unlock_domains(src, dest);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@@ -2301,21 +2329,42 @@ static long gnttab_copy(
|
||||
struct gnttab_copy op;
|
||||
struct gnttab_copy_buf src = {};
|
||||
struct gnttab_copy_buf dest = {};
|
||||
+ long rc = 0;
|
||||
|
||||
for ( i = 0; i < count; i++ )
|
||||
{
|
||||
- if (i && hypercall_preempt_check())
|
||||
- return i;
|
||||
+ if ( i && hypercall_preempt_check() )
|
||||
+ {
|
||||
+ rc = i;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
|
||||
- return -EFAULT;
|
||||
+ {
|
||||
+ rc = -EFAULT;
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
op.status = gnttab_copy_one(&op, &dest, &src);
|
||||
+ if ( op.status != GNTST_okay )
|
||||
+ {
|
||||
+ gnttab_copy_release_buf(&src);
|
||||
+ gnttab_copy_release_buf(&dest);
|
||||
+ }
|
||||
|
||||
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
|
||||
- return -EFAULT;
|
||||
+ {
|
||||
+ rc = -EFAULT;
|
||||
+ break;
|
||||
+ }
|
||||
guest_handle_add_offset(uop, 1);
|
||||
}
|
||||
- return 0;
|
||||
+
|
||||
+ gnttab_copy_release_buf(&src);
|
||||
+ gnttab_copy_release_buf(&dest);
|
||||
+ gnttab_copy_unlock_domains(&src, &dest);
|
||||
+
|
||||
+ return rc;
|
||||
}
|
||||
|
||||
static long
|
@ -1,90 +0,0 @@
|
||||
References: bsc#949138
|
||||
|
||||
Subject: libxl: make some _dispose functions idempotent and tolerate NULL
|
||||
From: Wei Liu wei.liu2@citrix.com Wed Feb 25 14:56:02 2015 +0000
|
||||
Date: Mon Mar 2 17:05:35 2015 +0000:
|
||||
Git: 1ea68f1a82ef94b3cc644fa70307c5151f356baf
|
||||
|
||||
These functions are not generated, so we need to do it by hand.
|
||||
|
||||
Functions list:
|
||||
libxl_bitmap_dispose
|
||||
libxl_string_list_dispose
|
||||
libxl_key_value_list_dipose
|
||||
libxl_cpuid_dispose
|
||||
|
||||
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
|
||||
Cc: Ian Campbell <ian.campbell@citrix.com>
|
||||
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl.c
|
||||
@@ -211,9 +211,12 @@ void libxl_string_list_dispose(libxl_str
|
||||
if (!sl)
|
||||
return;
|
||||
|
||||
- for (i = 0; sl[i] != NULL; i++)
|
||||
+ for (i = 0; sl[i] != NULL; i++) {
|
||||
free(sl[i]);
|
||||
+ sl[i] = NULL;
|
||||
+ }
|
||||
free(sl);
|
||||
+ *psl = NULL;
|
||||
}
|
||||
|
||||
void libxl_string_list_copy(libxl_ctx *ctx,
|
||||
@@ -273,10 +276,14 @@ void libxl_key_value_list_dispose(libxl_
|
||||
|
||||
for (i = 0; kvl[i] != NULL; i += 2) {
|
||||
free(kvl[i]);
|
||||
- if (kvl[i + 1])
|
||||
+ kvl[i] = NULL;
|
||||
+ if (kvl[i + 1]) {
|
||||
free(kvl[i + 1]);
|
||||
+ kvl[i+1] = NULL;
|
||||
+ }
|
||||
}
|
||||
free(kvl);
|
||||
+ *pkvl = NULL;
|
||||
}
|
||||
|
||||
void libxl_key_value_list_copy(libxl_ctx *ctx,
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_cpuid.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_cpuid.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_cpuid.c
|
||||
@@ -28,10 +28,13 @@ void libxl_cpuid_dispose(libxl_cpuid_pol
|
||||
return;
|
||||
for (i = 0; cpuid_list[i].input[0] != XEN_CPUID_INPUT_UNUSED; i++) {
|
||||
for (j = 0; j < 4; j++)
|
||||
- if (cpuid_list[i].policy[j] != NULL)
|
||||
+ if (cpuid_list[i].policy[j] != NULL) {
|
||||
free(cpuid_list[i].policy[j]);
|
||||
+ cpuid_list[i].policy[j] = NULL;
|
||||
+ }
|
||||
}
|
||||
free(cpuid_list);
|
||||
+ *p_cpuid_list = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_utils.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_utils.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_utils.c
|
||||
@@ -604,7 +604,12 @@ void libxl_bitmap_init(libxl_bitmap *map
|
||||
|
||||
void libxl_bitmap_dispose(libxl_bitmap *map)
|
||||
{
|
||||
+ if (!map)
|
||||
+ return;
|
||||
+
|
||||
free(map->map);
|
||||
+ map->map = NULL;
|
||||
+ map->size = 0;
|
||||
}
|
||||
|
||||
void libxl_bitmap_copy(libxl_ctx *ctx, libxl_bitmap *dptr,
|
@ -1,113 +0,0 @@
# Commit 88a2372c6ba44dd42b915a95a823cf9d4d260e25
# Date 2015-03-23 16:49:42 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
vm-assist: prepare for discontiguous used bit numbers

Since the a flag will get assigned a value discontiguous to the
existing ones (in order to preserve the low bits, as only those are
currently accessible to 32-bit guests), this requires a little bit of
rework of the VM assist code in general: An architecture specific
VM_ASSIST_VALID definition gets introduced (with an optional compat
mode counterpart), and compilation of the respective code becomes
conditional upon this being defined (ARM doesn't wire these up and
hence doesn't need that code).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>

--- a/xen/common/compat/kernel.c
+++ b/xen/common/compat/kernel.c
@@ -41,6 +41,11 @@ CHECK_TYPE(domain_handle);
#define xennmi_callback compat_nmi_callback
#define xennmi_callback_t compat_nmi_callback_t

+#ifdef COMPAT_VM_ASSIST_VALID
+#undef VM_ASSIST_VALID
+#define VM_ASSIST_VALID COMPAT_VM_ASSIST_VALID
+#endif
+
#define DO(fn) int compat_##fn
#define COMPAT

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1325,9 +1325,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN
return rc;
}

-long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
+#ifdef VM_ASSIST_VALID
+long vm_assist(struct domain *p, unsigned int cmd, unsigned int type,
+ unsigned long valid)
{
- if ( type > MAX_VMASST_TYPE )
+ if ( type >= BITS_PER_LONG || !test_bit(type, &valid) )
return -EINVAL;

switch ( cmd )
@@ -1342,6 +1344,7 @@ long vm_assist(struct domain *p, unsigne

return -ENOSYS;
}
+#endif

struct pirq *pirq_get_info(struct domain *d, int pirq)
{
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -396,10 +396,12 @@ DO(nmi_op)(unsigned int cmd, XEN_GUEST_H
return rc;
}

+#ifdef VM_ASSIST_VALID
DO(vm_assist)(unsigned int cmd, unsigned int type)
{
- return vm_assist(current->domain, cmd, type);
+ return vm_assist(current->domain, cmd, type, VM_ASSIST_VALID);
}
+#endif

DO(ni_hypercall)(void)
{
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -327,6 +327,14 @@ extern unsigned long xen_phys_start;
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))

+#define NATIVE_VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments) | \
+ (1UL << VMASST_TYPE_4gb_segments_notify) | \
+ (1UL << VMASST_TYPE_writable_pagetables) | \
+ (1UL << VMASST_TYPE_pae_extended_cr3))
+#define VM_ASSIST_VALID NATIVE_VM_ASSIST_VALID
+#define COMPAT_VM_ASSIST_VALID (NATIVE_VM_ASSIST_VALID & \
+ ((1UL << COMPAT_BITS_PER_LONG) - 1))
+
#define ELFSIZE 64

#define ARCH_CRASH_SAVE_VMCOREINFO
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -486,7 +486,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3

+#if __XEN_INTERFACE_VERSION__ < 0x00040600
#define MAX_VMASST_TYPE 3
+#endif

#ifndef __ASSEMBLY__

--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -92,7 +92,8 @@ extern void guest_printk(const struct do
__attribute__ ((format (printf, 2, 3)));
extern void noreturn panic(const char *format, ...)
__attribute__ ((format (printf, 1, 2)));
-extern long vm_assist(struct domain *, unsigned int, unsigned int);
+extern long vm_assist(struct domain *, unsigned int cmd, unsigned int type,
+ unsigned long valid);
extern int __printk_ratelimit(int ratelimit_ms, int ratelimit_burst);
extern int printk_ratelimit(void);

@ -1,609 +0,0 @@
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_dm.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dm.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_dm.c
|
||||
@@ -445,6 +445,15 @@ static char ** libxl__build_device_model
|
||||
flexarray_append(dm_args, "-mon");
|
||||
flexarray_append(dm_args, "chardev=libxl-cmd,mode=control");
|
||||
|
||||
+ flexarray_append(dm_args, "-chardev");
|
||||
+ flexarray_append(dm_args,
|
||||
+ libxl__sprintf(gc, "socket,id=libxenstat-cmd,"
|
||||
+ "path=%s/qmp-libxenstat-%d,server,nowait",
|
||||
+ libxl__run_dir_path(), guest_domid));
|
||||
+
|
||||
+ flexarray_append(dm_args, "-mon");
|
||||
+ flexarray_append(dm_args, "chardev=libxenstat-cmd,mode=control");
|
||||
+
|
||||
for (i = 0; i < guest_config->num_channels; i++) {
|
||||
connection = guest_config->channels[i].connection;
|
||||
devid = guest_config->channels[i].devid;
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_qmp.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_qmp.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_qmp.c
|
||||
@@ -723,6 +723,13 @@ void libxl__qmp_cleanup(libxl__gc *gc, u
|
||||
LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
|
||||
}
|
||||
}
|
||||
+
|
||||
+ qmp_socket = GCSPRINTF("%s/qmp-libxenstat-%d", libxl__run_dir_path(), domid);
|
||||
+ if (unlink(qmp_socket) == -1) {
|
||||
+ if (errno != ENOENT) {
|
||||
+ LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
|
||||
int libxl__qmp_query_serial(libxl__qmp_handler *qmp)
|
||||
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/Makefile
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/Makefile
|
||||
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/Makefile
|
||||
@@ -24,7 +24,7 @@ MINOR=0
|
||||
LIB=src/libxenstat.a
|
||||
SHLIB=src/libxenstat.so.$(MAJOR).$(MINOR)
|
||||
SHLIB_LINKS=src/libxenstat.so.$(MAJOR) src/libxenstat.so
|
||||
-OBJECTS-y=src/xenstat.o
|
||||
+OBJECTS-y=src/xenstat.o src/xenstat_qmp.o
|
||||
OBJECTS-$(CONFIG_Linux) += src/xenstat_linux.o
|
||||
OBJECTS-$(CONFIG_SunOS) += src/xenstat_solaris.o
|
||||
OBJECTS-$(CONFIG_NetBSD) += src/xenstat_netbsd.o
|
||||
@@ -32,7 +32,7 @@ OBJECTS-$(CONFIG_FreeBSD) += src/xenstat
|
||||
SONAME_FLAGS=-Wl,$(SONAME_LDFLAG) -Wl,libxenstat.so.$(MAJOR)
|
||||
|
||||
CFLAGS+=-fPIC
|
||||
-CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude)
|
||||
+CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude) -include $(XEN_ROOT)/tools/config.h
|
||||
|
||||
LDLIBS-y = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl)
|
||||
LDLIBS-$(CONFIG_SunOS) += -lkstat
|
||||
Index: xen-4.5.1-testing/tools/xenstat/xentop/Makefile
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/xenstat/xentop/Makefile
|
||||
+++ xen-4.5.1-testing/tools/xenstat/xentop/Makefile
|
||||
@@ -19,7 +19,7 @@ all install xentop:
|
||||
else
|
||||
|
||||
CFLAGS += -DGCC_PRINTF -Werror $(CFLAGS_libxenstat)
|
||||
-LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(SOCKET_LIBS) -lm
|
||||
+LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(SOCKET_LIBS) -lm -lyajl
|
||||
CFLAGS += -DHOST_$(XEN_OS)
|
||||
|
||||
# Include configure output (config.h) to headers search path
|
||||
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_priv.h
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat_priv.h
|
||||
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_priv.h
|
||||
@@ -109,5 +109,7 @@ extern int xenstat_collect_networks(xens
|
||||
extern void xenstat_uninit_networks(xenstat_handle * handle);
|
||||
extern int xenstat_collect_vbds(xenstat_node * node);
|
||||
extern void xenstat_uninit_vbds(xenstat_handle * handle);
|
||||
+extern void read_attributes_qdisk(xenstat_node * node);
|
||||
+extern xenstat_vbd *xenstat_save_vbd(xenstat_domain * domain, xenstat_vbd * vbd);
|
||||
|
||||
#endif /* XENSTAT_PRIV_H */
|
||||
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat.c
|
||||
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat.c
|
||||
@@ -657,6 +657,27 @@ static void xenstat_uninit_xen_version(x
|
||||
* VBD functions
|
||||
*/
|
||||
|
||||
+/* Save VBD information */
|
||||
+xenstat_vbd *xenstat_save_vbd(xenstat_domain *domain, xenstat_vbd *vbd)
|
||||
+{
|
||||
+ xenstat_vbd *vbds = domain->vbds;
|
||||
+
|
||||
+ domain->num_vbds++;
|
||||
+ domain->vbds = realloc(domain->vbds,
|
||||
+ domain->num_vbds *
|
||||
+ sizeof(xenstat_vbd));
|
||||
+
|
||||
+ if (domain->vbds == NULL) {
|
||||
+ domain->num_vbds = 0;
|
||||
+ free(vbds);
|
||||
+ }
|
||||
+ else {
|
||||
+ domain->vbds[domain->num_vbds - 1] = *vbd;
|
||||
+ }
|
||||
+
|
||||
+ return domain->vbds;
|
||||
+}
|
||||
+
|
||||
/* Free VBD information */
|
||||
static void xenstat_free_vbds(xenstat_node * node)
|
||||
{
|
||||
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_linux.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat_linux.c
|
||||
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_linux.c
|
||||
@@ -417,6 +417,9 @@ int xenstat_collect_vbds(xenstat_node *
|
||||
}
|
||||
}
|
||||
|
||||
+ /* Get qdisk statistics */
|
||||
+ read_attributes_qdisk(node);
|
||||
+
|
||||
rewinddir(priv->sysfsvbd);
|
||||
|
||||
for(dp = readdir(priv->sysfsvbd); dp != NULL ;
|
||||
@@ -477,18 +480,10 @@ int xenstat_collect_vbds(xenstat_node *
|
||||
continue;
|
||||
}
|
||||
|
||||
- if (domain->vbds == NULL) {
|
||||
- domain->num_vbds = 1;
|
||||
- domain->vbds = malloc(sizeof(xenstat_vbd));
|
||||
- } else {
|
||||
- domain->num_vbds++;
|
||||
- domain->vbds = realloc(domain->vbds,
|
||||
- domain->num_vbds *
|
||||
- sizeof(xenstat_vbd));
|
||||
- }
|
||||
- if (domain->vbds == NULL)
|
||||
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL) {
|
||||
+ perror("Allocation error");
|
||||
return 0;
|
||||
- domain->vbds[domain->num_vbds - 1] = vbd;
|
||||
+ }
|
||||
}
|
||||
|
||||
return 1;
|
||||
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_qmp.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_qmp.c
|
||||
@@ -0,0 +1,451 @@
|
||||
+/* libxenstat: statistics-collection library for Xen
|
||||
+ *
|
||||
+ * This library is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * This library is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#include <fcntl.h>
|
||||
+#include <sys/types.h>
|
||||
+#include <sys/socket.h>
|
||||
+#include <sys/poll.h>
|
||||
+#include <sys/un.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+#include <xenctrl.h>
|
||||
+
|
||||
+#include "xenstat_priv.h"
|
||||
+
|
||||
+#ifdef HAVE_YAJL_YAJL_VERSION_H
|
||||
+# include <yajl/yajl_version.h>
|
||||
+#endif
|
||||
+
|
||||
+/* YAJL version check */
|
||||
+#if defined(YAJL_MAJOR) && (YAJL_MAJOR > 1)
|
||||
+# define HAVE_YAJL_V2 1
|
||||
+#endif
|
||||
+
|
||||
+#ifdef HAVE_YAJL_V2
|
||||
+
|
||||
+#include <yajl/yajl_tree.h>
|
||||
+
|
||||
+static unsigned char *qmp_query(int, char *);
|
||||
+
|
||||
+enum query_blockstats {
|
||||
+ QMP_STATS_RETURN = 0,
|
||||
+ QMP_STATS_DEVICE = 1,
|
||||
+ QMP_STATS = 2,
|
||||
+ QMP_RD_BYTES = 3,
|
||||
+ QMP_WR_BYTES = 4,
|
||||
+ QMP_RD_OPERATIONS = 5,
|
||||
+ QMP_WR_OPERATIONS = 6,
|
||||
+};
|
||||
+
|
||||
+enum query_block {
|
||||
+ QMP_BLOCK_RETURN = 0,
|
||||
+ QMP_BLOCK_DEVICE = 1,
|
||||
+ QMP_INSERTED = 2,
|
||||
+ QMP_FILE = 3,
|
||||
+};
|
||||
+
|
||||
+
|
||||
+/* Given the qmp device name, get the image filename associated with it
|
||||
+ QMP Syntax for querying block infomation:
|
||||
+ In: { "execute": "query-block" }
|
||||
+ Out: {"return": [{
|
||||
+ "device": 'str, "locked": 'bool', "removable": bool,
|
||||
+ "inserted": {
|
||||
+ "iops_rd": 'int',
|
||||
+ "image": {
|
||||
+ "virtual-size": 'int', "filename": 'str', "cluster-size": 'int',
|
||||
+ "format": 'str', "actual-size": 'int', "dirty-flag": 'bool'
|
||||
+ },
|
||||
+ "iops_wr": 'int', "ro": 'bool', "backing_file_depth": 'int',
|
||||
+ "drv": 'str', "iops": 'int', "bps_wr": 'int', "encrypted": 'bool',
|
||||
+ "bps": 'int', "bps_rd": 'int',
|
||||
+ "file": 'str', "encryption_key_missing": 'bool'
|
||||
+ },
|
||||
+ "type": 'str'
|
||||
+ }]}
|
||||
+*/
|
||||
+static char *qmp_get_block_image(xenstat_node *node, char *qmp_devname, int qfd)
|
||||
+{
|
||||
+ char *tmp, *file = NULL;
|
||||
+ char *query_block_cmd = "{ \"execute\": \"query-block\" }";
|
||||
+ static const char *const qblock[] = {
|
||||
+ [ QMP_BLOCK_RETURN ] = "return",
|
||||
+ [ QMP_BLOCK_DEVICE ] = "device",
|
||||
+ [ QMP_INSERTED ] = "inserted",
|
||||
+ [ QMP_FILE ] = "file",
|
||||
+ };
|
||||
+ const char *ptr[] = {0, 0};
|
||||
+ unsigned char *qmp_stats;
|
||||
+ yajl_val info, ret_obj, dev_obj, n;
|
||||
+ int i;
|
||||
+
|
||||
+ if ((qmp_stats = qmp_query(qfd, query_block_cmd)) == NULL)
|
||||
+ return NULL;
|
||||
+
|
||||
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature with bug fixes */
|
||||
+ if ((info = yajl_tree_parse((char *)qmp_stats, NULL, 0)) == NULL) {
|
||||
+ free(qmp_stats);
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ ptr[0] = qblock[QMP_BLOCK_RETURN]; /* "return" */
|
||||
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
|
||||
+ goto done;
|
||||
+
|
||||
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
|
||||
+ n = YAJL_GET_ARRAY(ret_obj)->values[i];
|
||||
+
|
||||
+ ptr[0] = qblock[QMP_BLOCK_DEVICE]; /* "device" */
|
||||
+ if ((dev_obj = yajl_tree_get(n, ptr, yajl_t_any)) != NULL) {
|
||||
+ tmp = YAJL_GET_STRING(dev_obj);
|
||||
+ if (strcmp(qmp_devname, tmp))
|
||||
+ continue;
|
||||
+ }
|
||||
+ else
|
||||
+ continue;
|
||||
+
|
||||
+ ptr[0] = qblock[QMP_INSERTED]; /* "inserted" */
|
||||
+ n = yajl_tree_get(n, ptr, yajl_t_any);
|
||||
+ if (n) {
|
||||
+ ptr[0] = qblock[QMP_FILE]; /* "file" */
|
||||
+ n = yajl_tree_get(n, ptr, yajl_t_any);
|
||||
+ if (n && YAJL_IS_STRING(n)) {
|
||||
+ tmp = YAJL_GET_STRING(n);
|
||||
+ file = malloc(strlen(tmp)+1);
|
||||
+ if (file != NULL)
|
||||
+ strcpy(file, tmp);
|
||||
+ goto done;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+done:
|
||||
+ yajl_tree_free(info);
|
||||
+ return file;
|
||||
+}
|
||||
+
|
||||
+
|
||||
+/* Given a QMP device name, lookup the associated xenstore qdisk device id */
|
||||
+static void lookup_xenstore_devid(xenstat_node * node, unsigned int domid, char *qmp_devname,
|
||||
+ int qfd, unsigned int *dev, unsigned int *sector_size)
|
||||
+{
|
||||
+ char **dev_ids, *tmp, *ptr, *image, path[80];
|
||||
+ unsigned int num_dev_ids;
|
||||
+ int i, devid;
|
||||
+
|
||||
+ /* Get all the qdisk dev IDs associated with the this VM */
|
||||
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", domid);
|
||||
+ dev_ids = xs_directory(node->handle->xshandle, XBT_NULL, path, &num_dev_ids);
|
||||
+ if (dev_ids == NULL) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* Get the filename of the image associated with this QMP device */
|
||||
+ image = qmp_get_block_image(node, qmp_devname, qfd);
|
||||
+ if (image == NULL) {
|
||||
+ free(dev_ids);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* Look for a matching image in xenstore */
|
||||
+ for (i=0; i<num_dev_ids; i++) {
|
||||
+ devid = atoi(dev_ids[i]);
|
||||
+ /* Get the xenstore name of the image */
|
||||
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/params", domid, devid);
|
||||
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
|
||||
+ continue;
|
||||
+
|
||||
+ /* Get to actual path in string */
|
||||
+ if ((tmp = strchr(ptr, '/')) == NULL)
|
||||
+ tmp = ptr;
|
||||
+ if (!strcmp(tmp,image)) {
|
||||
+ *dev = devid;
|
||||
+ free(ptr);
|
||||
+
|
||||
+ /* Get the xenstore sector size of the image while we're here */
|
||||
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/sector-size", domid, devid);
|
||||
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) != NULL) {
|
||||
+ *sector_size = atoi((char *)ptr);
|
||||
+ free(ptr);
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
+ free(ptr);
|
||||
+ }
|
||||
+
|
||||
+ free(image);
|
||||
+ free(dev_ids);
|
||||
+}
|
||||
+
|
||||
+/* Parse the stats buffer which contains I/O data for all the disks belonging to domid */
|
||||
+static void qmp_parse_stats(xenstat_node *node, unsigned int domid, unsigned char *stats_buf, int qfd)
|
||||
+{
|
||||
+ char *qmp_devname;
|
||||
+ static const char *const qstats[] = {
|
||||
+ [ QMP_STATS_RETURN ] = "return",
|
||||
+ [ QMP_STATS_DEVICE ] = "device",
|
||||
+ [ QMP_STATS ] = "stats",
|
||||
+ [ QMP_RD_BYTES ] = "rd_bytes",
|
||||
+ [ QMP_WR_BYTES ] = "wr_bytes",
|
||||
+ [ QMP_RD_OPERATIONS ] = "rd_operations",
|
||||
+ [ QMP_WR_OPERATIONS ] = "wr_operations",
|
||||
+ };
|
||||
+ const char *ptr[] = {0, 0};
|
||||
+ yajl_val info, ret_obj, stats_obj, n;
|
||||
+ xenstat_vbd vbd;
|
||||
+ xenstat_domain *domain;
|
||||
+ unsigned int sector_size = 512;
|
||||
+ int i, j;
|
||||
+
|
||||
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature */
|
||||
+ if ((info = yajl_tree_parse((char *)stats_buf, NULL, 0)) == NULL)
|
||||
+ return;
|
||||
+
|
||||
+ ptr[0] = qstats[QMP_STATS_RETURN]; /* "return" */
|
||||
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
|
||||
+ goto done;
|
||||
+
|
||||
+ /* Array of devices */
|
||||
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
|
||||
+ memset(&vbd, 0, sizeof(xenstat_vbd));
|
||||
+ qmp_devname = NULL;
|
||||
+ stats_obj = YAJL_GET_ARRAY(ret_obj)->values[i];
|
||||
+
|
||||
+ ptr[0] = qstats[QMP_STATS_DEVICE]; /* "device" */
|
||||
+ if ((n = yajl_tree_get(stats_obj, ptr, yajl_t_any)) != NULL)
|
||||
+ qmp_devname = YAJL_GET_STRING(n);
|
||||
+
|
||||
+ ptr[0] = qstats[QMP_STATS]; /* "stats" */
|
||||
+ stats_obj = yajl_tree_get(stats_obj, ptr, yajl_t_object);
|
||||
+ if (stats_obj && YAJL_IS_OBJECT(stats_obj)) {
|
||||
+ for (j=3; j<7; j++) {
|
||||
+ ptr[0] = qstats[j];
|
||||
+ n = yajl_tree_get(stats_obj, ptr, yajl_t_number);
|
||||
+ if (n && YAJL_IS_NUMBER(n)) {
|
||||
+ switch(j) {
|
||||
+ case QMP_RD_BYTES: /* "rd_bytes" */
|
||||
+ vbd.rd_sects = YAJL_GET_INTEGER(n) / sector_size;
|
||||
+ break;
|
||||
+ case QMP_WR_BYTES: /* "wr_bytes" */
|
||||
+ vbd.wr_sects = YAJL_GET_INTEGER(n) / sector_size;
|
||||
+ break;
|
||||
+ case QMP_RD_OPERATIONS: /* "rd_operations" */
|
||||
+ vbd.rd_reqs = YAJL_GET_INTEGER(n);
|
||||
+ break;
|
||||
+ case QMP_WR_OPERATIONS: /* "wr_operations" */
|
||||
+ vbd.wr_reqs = YAJL_GET_INTEGER(n);
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ /* With the QMP device name, lookup the xenstore qdisk device ID and set vdb.dev */
|
||||
+ if (qmp_devname)
|
||||
+ lookup_xenstore_devid(node, domid, qmp_devname, qfd, &vbd.dev, §or_size);
|
||||
+ if ((domain = xenstat_node_domain(node, domid)) == NULL)
|
||||
+ continue;
|
||||
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL)
|
||||
+ goto done;
|
||||
+ }
|
||||
+ }
|
||||
+done:
|
||||
+ yajl_tree_free(info);
|
||||
+}
|
||||
+
|
||||
+/* Write a command via the QMP. Returns number of bytes written */
|
||||
+static size_t qmp_write(int qfd, char *cmd, size_t cmd_len)
|
||||
+{
|
||||
+ size_t pos = 0;
|
||||
+ ssize_t res;
|
||||
+
|
||||
+ while (cmd_len > pos) {
|
||||
+ res = write(qfd, cmd + pos, cmd_len - pos);
|
||||
+ switch (res) {
|
||||
+ case -1:
|
||||
+ if (errno == EINTR || errno == EAGAIN)
|
||||
+ continue;
|
||||
+ return 0;
|
||||
+ case 0:
|
||||
+ errno = EPIPE;
|
||||
+ return pos;
|
||||
+ default:
|
||||
+ pos += (size_t)res;
|
||||
+ }
|
||||
+ }
|
||||
+ return pos;
|
||||
+}
|
||||
+
|
||||
+/* Read the data sent in response to a QMP execute query. Returns 1 for success */
|
||||
+static int qmp_read(int qfd, unsigned char **qstats)
|
||||
+{
|
||||
+ unsigned char buf[1024], *ptr;
|
||||
+ struct pollfd pfd[2];
|
||||
+ int n, qsize = 0;
|
||||
+
|
||||
+ *qstats = NULL;
|
||||
+ pfd[0].fd = qfd;
|
||||
+ pfd[0].events = POLLIN;
|
||||
+ while ((n = poll(pfd, POLLIN, 10)) > 0) {
|
||||
+ if (pfd[0].revents & POLLIN) {
|
||||
+ if ((n = read(qfd, buf, sizeof(buf))) < 0) {
|
||||
+ free(*qstats);
|
||||
+ return 0;
|
||||
+ }
|
||||
+ ptr = realloc(*qstats, qsize+n+1);
|
||||
+ if (ptr == NULL) {
|
||||
+ free(*qstats);
|
||||
+ return 0;
|
||||
+ }
|
||||
+ memcpy(&ptr[qsize], buf, n);
|
||||
+ qsize += n;
|
||||
+ ptr[qsize] = 0;
|
||||
+ *qstats = ptr;
|
||||
+ }
|
||||
+ }
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+/* With the given cmd, query QMP for requested data. Returns allocated buffer containing data or NULL */
|
||||
+static unsigned char *qmp_query(int qfd, char *cmd)
|
||||
+{
|
||||
+ unsigned char *qstats = NULL;
|
||||
+ int n;
|
||||
+
|
||||
+ n = strlen(cmd);
|
||||
+ if (qmp_write(qfd, cmd, n) != n)
|
||||
+ return NULL;
|
||||
+ if (!qmp_read(qfd, &qstats))
|
||||
+ return NULL;
|
||||
+ return qstats;
|
||||
+}
|
||||
+
|
||||
+/* Returns a socket connected to the QMP socket. Returns -1 on failure. */
|
||||
+static int qmp_connect(char *path)
|
||||
+{
|
||||
+ struct sockaddr_un sun;
|
||||
+ int s;
|
||||
+
|
||||
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
|
||||
+ return -1;
|
||||
+ (void)fcntl(s, F_SETFD, 1);
|
||||
+
|
||||
+ memset(&sun, 0, sizeof(struct sockaddr_un));
|
||||
+ sun.sun_family = AF_UNIX;
|
||||
+
|
||||
+ if (strlen(path) >= sizeof(sun.sun_path)) {
|
||||
+ close(s);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ strcpy(sun.sun_path, path);
|
||||
+ if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) < 0) {
|
||||
+ close(s);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return s;
|
||||
+}
|
||||
+
|
||||
+/* Get up to 1024 active domains */
|
||||
+static xc_domaininfo_t *get_domain_ids(int *num_doms)
|
||||
+{
|
||||
+ xc_domaininfo_t *dominfo;
|
||||
+ xc_interface *xc_handle;
|
||||
+
|
||||
+ dominfo = calloc(1024, sizeof(xc_domaininfo_t));
|
||||
+ if (dominfo == NULL)
|
||||
+ return NULL;
|
||||
+ xc_handle = xc_interface_open(0,0,0);
|
||||
+ *num_doms = xc_domain_getinfolist(xc_handle, 0, 1024, dominfo);
|
||||
+ xc_interface_close(xc_handle);
|
||||
+ return dominfo;
|
||||
+}
|
||||
+
|
||||
+/* Gather the qdisk statistics by querying QMP
|
||||
+ Resources: http://wiki.qemu.org/QMP and qmp-commands.hx from the qemu code
|
||||
+ QMP Syntax for entering command mode. This command must be issued before
|
||||
+ issuing any other command:
|
||||
+ In: {"execute": "qmp_capabilities"}
|
||||
+ Out: {"return": {}}
|
||||
+ QMP Syntax for querying block statistics:
|
||||
+ In: { "execute": "query-blockstats" }
|
||||
+ Out: {"return": [{
|
||||
+ "device": 'str',
|
||||
+ "parent": {
|
||||
+ "stats": {
|
||||
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
|
||||
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
|
||||
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
|
||||
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
|
||||
+ }
|
||||
+ },
|
||||
+ "stats": {
|
||||
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
|
||||
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
|
||||
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
|
||||
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
|
||||
+ }
|
||||
+ }]}
|
||||
+*/
|
||||
+void read_attributes_qdisk(xenstat_node * node)
|
||||
+{
|
||||
+ char *cmd_mode = "{ \"execute\": \"qmp_capabilities\" }";
|
||||
+ char *query_blockstats_cmd = "{ \"execute\": \"query-blockstats\" }";
|
||||
+ xc_domaininfo_t *dominfo = NULL;
|
||||
+ unsigned char *qmp_stats, *val;
|
||||
+ char path[80];
|
||||
+ int i, qfd, num_doms;
|
||||
+
|
||||
+ dominfo = get_domain_ids(&num_doms);
|
||||
+ if (dominfo == NULL)
|
||||
+ return;
|
||||
+
|
||||
+ for (i=0; i<num_doms; i++) {
|
||||
+ if (dominfo[i].domain <= 0)
|
||||
+ continue;
|
||||
+
|
||||
+ /* Verify that qdisk disks are used with this VM */
|
||||
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", dominfo[i].domain);
|
||||
+ if ((val = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
|
||||
+ continue;
|
||||
+ free(val);
|
||||
+
|
||||
+ /* Connect to this VMs QMP socket */
|
||||
+ snprintf(path, sizeof(path), "/var/run/xen/qmp-libxenstat-%i", dominfo[i].domain);
|
||||
+ if ((qfd = qmp_connect(path)) < 0) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ /* First enable QMP capabilities so that we can query for data */
|
||||
+ if ((qmp_stats = qmp_query(qfd, cmd_mode)) != NULL) {
|
||||
+ free(qmp_stats);
|
||||
+ /* Query QMP for this VMs blockstats */
|
||||
+ if ((qmp_stats = qmp_query(qfd, query_blockstats_cmd)) != NULL) {
|
||||
+ qmp_parse_stats(node, dominfo[i].domain, qmp_stats, qfd);
|
||||
+ free(qmp_stats);
|
||||
+ }
|
||||
+ }
|
||||
+ close(qfd);
|
||||
+ }
|
||||
+
|
||||
+ free(dominfo);
|
||||
+}
|
||||
+
|
||||
+#else /* !HAVE_YAJL_V2 */
|
||||
+
|
||||
+/* Statistics gathering for qdisks requires at least yajl v2 */
|
||||
+void read_attributes_qdisk(xenstat_node * node)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#endif /* !HAVE_YAJL_V2 */
|
@ -1,24 +0,0 @@
# Commit e59abf8c8c9c1d99a531292c6a548d6dfd0ceacc
# Date 2015-04-14 14:59:53 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/traps: identify the vcpu in context when dumping registers

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -53,9 +53,11 @@ static void _show_registers(
printk("\nRFLAGS: %016lx ", regs->rflags);
if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask));
- printk("CONTEXT: %s\n", context_names[context]);
+ printk("CONTEXT: %s", context_names[context]);
+ if ( v && !is_idle_vcpu(v) )
+ printk(" (%pv)", v);

- printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
+ printk("\nrax: %016lx rbx: %016lx rcx: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
regs->rdx, regs->rsi, regs->rdi);
@ -1,41 +0,0 @@
# Commit 63dcef9fe5b880007075b5eb53f9950a826519ce
# Date 2015-04-14 15:02:10 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvm: don't include asm/spinlock.h

asm/spinlock.h should not be included directly.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -52,7 +52,6 @@
#include <asm/xstate.h>
#include <asm/traps.h>
#include <asm/mc146818rtc.h>
-#include <asm/spinlock.h>
#include <asm/mce.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -41,7 +41,6 @@
#include <asm/msr.h>
#include <asm/i387.h>
#include <asm/iocap.h>
-#include <asm/spinlock.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -35,7 +35,6 @@
#include <asm/types.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
-#include <asm/spinlock.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/mem_sharing.h>
@ -1,22 +0,0 @@
# Commit f70df9ec1ab72b6bbebad72d81109c1b214007e1
# Date 2015-04-14 15:02:32 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mtrr: include asm/atomic.h

asm/atomic.h is needed but only included indirectly via
asm/spinlock.h.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/cpu/mtrr/main.c
+++ b/xen/arch/x86/cpu/mtrr/main.c
@@ -36,6 +36,7 @@
#include <xen/lib.h>
#include <xen/smp.h>
#include <xen/spinlock.h>
+#include <asm/atomic.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@ -1,46 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967

# Commit 70a3cbb8c9cb17a61fa25c48ba3d7b44fd059c90
# Date 2015-04-14 16:50:35 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: honor all mask requests

Commit 74fd0036de ("x86: properly handle MSI-X unmask operation from
guests") didn't go far enough: it fixed an issue with unmasking, but
left an issue with masking in place: Due to the (late) point in time
when qemu requests the hypervisor to set up MSI-X interrupts (which is
where the MMIO intercept gets put in place), the hypervisor doesn't
see all guest writes, and hence shouldn't make assumptions on the state
the virtual MSI-X resources are in. Bypassing the rest of the logic on
a guest mask operation leads to

[00:04.0] pci_msix_write: Error: Can't update msix entry 1 since MSI-X is already enabled.

which surprisingly enough doesn't lead to the device not working
anymore (I didn't dig in deep enough to figure out why that is). But it
does prevent the IRQ to be migrated inside the guest, i.e. all
interrupts will always arrive in vCPU 0.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -286,11 +286,11 @@ static int msixtbl_write(struct vcpu *v,
goto out;
}

- /* exit to device model if address/data has been modified */
- if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+ /* Exit to device model when unmasking and address/data got modified. */
+ if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
+ test_and_clear_bit(nr_entry, &entry->table_flags) )
{
- if ( !(val & PCI_MSIX_VECTOR_BITMASK) )
- v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
+ v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
goto out;
}

@ -1,58 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967

# Commit df9f5676b3711c95127d44e871ad7ca38d6ed28a
# Date 2015-04-14 16:51:18 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: add valid bits for read acceleration

Again because Xen doesn't get to see all guest writes, it shouldn't
serve reads from its cache before having seen a write to the respective
address.

Also use DECLARE_BITMAP() in a related field declaration instead of
open coding it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -154,11 +154,14 @@ struct msixtbl_entry
     struct pci_dev *pdev;
     unsigned long gtable;       /* gpa of msix table */
     unsigned long table_len;
-    unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
+    DECLARE_BITMAP(table_flags, MAX_MSIX_TABLE_ENTRIES);
 #define MAX_MSIX_ACC_ENTRIES 3
     struct {
         uint32_t msi_ad[3];     /* Shadow of address low, high and data */
     } gentries[MAX_MSIX_ACC_ENTRIES];
+    DECLARE_BITMAP(acc_valid, 3 * MAX_MSIX_ACC_ENTRIES);
+#define acc_bit(what, ent, slot, idx) \
+        what##_bit((slot) * 3 + (idx), (ent)->acc_valid)
     struct rcu_head rcu;
 };
 
@@ -233,9 +236,10 @@ static int msixtbl_read(
     if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
     {
         nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
-        if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
-            goto out;
         index = offset / sizeof(uint32_t);
+        if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ||
+             !acc_bit(test, entry, nr_entry, index) )
+            goto out;
         *pval = entry->gentries[nr_entry].msi_ad[index];
     }
     else
@@ -281,6 +285,7 @@ static int msixtbl_write(struct vcpu *v,
     {
         index = offset / sizeof(uint32_t);
         entry->gentries[nr_entry].msi_ad[index] = val;
+        acc_bit(set, entry, nr_entry, index);
     }
     set_bit(nr_entry, &entry->table_flags);
     goto out;
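Illustration (not part of the patch above): the acc_valid bitmap is an instance of the common "validity bit per cached word" pattern, where a read may only be served from the shadow copy once a write has populated it. A minimal standalone C sketch of that pattern, with purely hypothetical names, could look like this:

#include <stdbool.h>
#include <stdint.h>

#define N_SLOTS 3               /* cached table entries */
#define N_WORDS 3               /* address low, address high, data */

struct acc_cache {
    uint32_t val[N_SLOTS][N_WORDS];
    unsigned long valid;        /* one bit per (slot, word); zero-initialise */
};

static void cache_write(struct acc_cache *c, unsigned int s, unsigned int w,
                        uint32_t v)
{
    c->val[s][w] = v;
    c->valid |= 1UL << (s * N_WORDS + w);
}

static bool cache_read(const struct acc_cache *c, unsigned int s,
                       unsigned int w, uint32_t *out)
{
    if ( !(c->valid & (1UL << (s * N_WORDS + w))) )
        return false;           /* not seen yet: caller takes the slow path */
    *out = c->val[s][w];
    return true;
}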
@ -1,27 +0,0 @@
References: bsc#945164

Subject: libxl: use DEBUG log level instead of INFO
From: Wei Liu wei.liu2@citrix.com Fri Apr 17 12:31:29 2015 +0100
Date: Wed Apr 22 14:40:40 2015 +0100:
Git: ddc17f311099c1f0f37a771a2f5f904d848102f7

Make libxl less noisy when destroying a domain.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

Index: xen-4.5.1-testing/tools/libxl/libxl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
+++ xen-4.5.1-testing/tools/libxl/libxl.c
@@ -1695,7 +1695,7 @@ static void devices_destroy_cb(libxl__eg
             _exit(-1);
         }
     }
-    LOG(INFO, "forked pid %ld for destroy of domain %d", (long)rc, domid);
+    LOG(DEBUG, "forked pid %ld for destroy of domain %d", (long)rc, domid);
 
     return;
@ -1,33 +0,0 @@
# Commit 017e667c433a1040306db6265b05e104568c70c8
# Date 2015-05-05 18:00:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
domctl: don't truncate XEN_DOMCTL_max_mem requests

Instead saturate the value if the input can't be represented in the
respective struct domain field.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>

--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -943,7 +943,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
 
     case XEN_DOMCTL_max_mem:
     {
-        unsigned long new_max;
+        uint64_t new_max;
 
         ret = -EINVAL;
         new_max = op->u.max_mem.max_memkb >> (PAGE_SHIFT-10);
@@ -954,7 +954,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
          * that the domain will now be allowed to "ratchet" down to new_max. In
          * the meantime, while tot > max, all new allocations are disallowed.
          */
-        d->max_pages = new_max;
+        d->max_pages = min(new_max, (uint64_t)(typeof(d->max_pages))-1);
         ret = 0;
         spin_unlock(&d->page_alloc_lock);
     }
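Illustration (not part of the patch above): taking min() against the largest value the field can hold turns an over-large request into saturation instead of silent truncation. A trivial standalone sketch, assuming a hypothetical 32-bit field:

#include <stdint.h>

/* Mirrors d->max_pages = min(new_max, (typeof(d->max_pages))-1) for a
 * field assumed (for illustration only) to be 32 bits wide. */
static uint32_t saturate_to_u32(uint64_t requested)
{
    return requested > UINT32_MAX ? UINT32_MAX : (uint32_t)requested;
}
/* saturate_to_u32(0x123456789ULL) == 0xffffffff; in-range values pass through. */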
@ -1,250 +0,0 @@
|
||||
# Commit d72a4605e18d3a61c4469ff092dbbbfa4ac919f7
|
||||
# Date 2015-05-05 18:01:33 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86: allow 64-bit PV guest kernels to suppress user mode exposure of M2P
|
||||
|
||||
Xen L4 entries being uniformly installed into any L4 table and 64-bit
|
||||
PV kernels running in ring 3 means that user mode was able to see the
|
||||
read-only M2P presented by Xen to the guests. While apparently not
|
||||
really representing an exploitable information leak, this still very
|
||||
certainly was never meant to be that way.
|
||||
|
||||
Building on the fact that these guests already have separate kernel and
|
||||
user mode page tables we can allow guest kernels to tell Xen that they
|
||||
don't want user mode to see this table. We can't, however, do this by
|
||||
default: There is no ABI requirement that kernel and user mode page
|
||||
tables be separate. Therefore introduce a new VM-assist flag allowing
|
||||
the guest to control respective hypervisor behavior:
|
||||
- when not set, L4 tables get created with the respective slot blank,
|
||||
and whenever the L4 table gets used as a kernel one the missing
|
||||
mapping gets inserted,
|
||||
- when set, L4 tables get created with the respective slot initialized
|
||||
as before, and whenever the L4 table gets used as a user one the
|
||||
mapping gets zapped.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
--- a/xen/arch/x86/domain.c
|
||||
+++ b/xen/arch/x86/domain.c
|
||||
@@ -338,7 +338,7 @@ static int setup_compat_l4(struct vcpu *
|
||||
|
||||
l4tab = __map_domain_page(pg);
|
||||
clear_page(l4tab);
|
||||
- init_guest_l4_table(l4tab, v->domain);
|
||||
+ init_guest_l4_table(l4tab, v->domain, 1);
|
||||
unmap_domain_page(l4tab);
|
||||
|
||||
v->arch.guest_table = pagetable_from_page(pg);
|
||||
@@ -977,7 +977,11 @@ int arch_set_info_guest(
|
||||
case -EINTR:
|
||||
rc = -ERESTART;
|
||||
case -ERESTART:
|
||||
+ break;
|
||||
case 0:
|
||||
+ if ( !compat && !VM_ASSIST(d, VMASST_TYPE_m2p_strict) &&
|
||||
+ !paging_mode_refcounts(d) )
|
||||
+ fill_ro_mpt(cr3_gfn);
|
||||
break;
|
||||
default:
|
||||
if ( cr3_page == current->arch.old_guest_table )
|
||||
@@ -1012,7 +1016,10 @@ int arch_set_info_guest(
|
||||
default:
|
||||
if ( cr3_page == current->arch.old_guest_table )
|
||||
cr3_page = NULL;
|
||||
+ break;
|
||||
case 0:
|
||||
+ if ( VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
|
||||
+ zap_ro_mpt(cr3_gfn);
|
||||
break;
|
||||
}
|
||||
}
|
||||
--- a/xen/arch/x86/domain_build.c
|
||||
+++ b/xen/arch/x86/domain_build.c
|
||||
@@ -1092,7 +1092,7 @@ int __init construct_dom0(
|
||||
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
|
||||
}
|
||||
clear_page(l4tab);
|
||||
- init_guest_l4_table(l4tab, d);
|
||||
+ init_guest_l4_table(l4tab, d, 0);
|
||||
v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
|
||||
if ( is_pv_32on64_domain(d) )
|
||||
v->arch.guest_table_user = v->arch.guest_table;
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -1380,7 +1380,8 @@ static int alloc_l3_table(struct page_in
|
||||
return rc > 0 ? 0 : rc;
|
||||
}
|
||||
|
||||
-void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d)
|
||||
+void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d,
|
||||
+ bool_t zap_ro_mpt)
|
||||
{
|
||||
/* Xen private mappings. */
|
||||
memcpy(&l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT],
|
||||
@@ -1395,6 +1396,25 @@ void init_guest_l4_table(l4_pgentry_t l4
|
||||
l4e_from_pfn(domain_page_map_to_mfn(l4tab), __PAGE_HYPERVISOR);
|
||||
l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
|
||||
l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR);
|
||||
+ if ( zap_ro_mpt || is_pv_32on64_domain(d) || paging_mode_refcounts(d) )
|
||||
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
|
||||
+}
|
||||
+
|
||||
+void fill_ro_mpt(unsigned long mfn)
|
||||
+{
|
||||
+ l4_pgentry_t *l4tab = map_domain_page(mfn);
|
||||
+
|
||||
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
|
||||
+ idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
|
||||
+ unmap_domain_page(l4tab);
|
||||
+}
|
||||
+
|
||||
+void zap_ro_mpt(unsigned long mfn)
|
||||
+{
|
||||
+ l4_pgentry_t *l4tab = map_domain_page(mfn);
|
||||
+
|
||||
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
|
||||
+ unmap_domain_page(l4tab);
|
||||
}
|
||||
|
||||
static int alloc_l4_table(struct page_info *page)
|
||||
@@ -1444,7 +1464,7 @@ static int alloc_l4_table(struct page_in
|
||||
adjust_guest_l4e(pl4e[i], d);
|
||||
}
|
||||
|
||||
- init_guest_l4_table(pl4e, d);
|
||||
+ init_guest_l4_table(pl4e, d, !VM_ASSIST(d, VMASST_TYPE_m2p_strict));
|
||||
unmap_domain_page(pl4e);
|
||||
|
||||
return rc > 0 ? 0 : rc;
|
||||
@@ -2755,6 +2775,8 @@ int new_guest_cr3(unsigned long mfn)
|
||||
|
||||
invalidate_shadow_ldt(curr, 0);
|
||||
|
||||
+ if ( !VM_ASSIST(d, VMASST_TYPE_m2p_strict) && !paging_mode_refcounts(d) )
|
||||
+ fill_ro_mpt(mfn);
|
||||
curr->arch.guest_table = pagetable_from_pfn(mfn);
|
||||
update_cr3(curr);
|
||||
|
||||
@@ -3111,6 +3133,9 @@ long do_mmuext_op(
|
||||
op.arg1.mfn);
|
||||
break;
|
||||
}
|
||||
+ if ( VM_ASSIST(d, VMASST_TYPE_m2p_strict) &&
|
||||
+ !paging_mode_refcounts(d) )
|
||||
+ zap_ro_mpt(op.arg1.mfn);
|
||||
}
|
||||
|
||||
curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
|
||||
--- a/xen/arch/x86/mm/shadow/multi.c
|
||||
+++ b/xen/arch/x86/mm/shadow/multi.c
|
||||
@@ -1438,6 +1438,13 @@ void sh_install_xen_entries_in_l4(struct
|
||||
shadow_l4e_from_mfn(page_to_mfn(d->arch.perdomain_l3_pg),
|
||||
__PAGE_HYPERVISOR);
|
||||
|
||||
+ if ( !shadow_mode_external(d) && !is_pv_32on64_domain(d) &&
|
||||
+ !VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
|
||||
+ {
|
||||
+ /* open coded zap_ro_mpt(mfn_x(sl4mfn)): */
|
||||
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty();
|
||||
+ }
|
||||
+
|
||||
/* Shadow linear mapping for 4-level shadows. N.B. for 3-level
|
||||
* shadows on 64-bit xen, this linear mapping is later replaced by the
|
||||
* monitor pagetable structure, which is built in make_monitor_table
|
||||
@@ -4062,6 +4069,17 @@ sh_update_cr3(struct vcpu *v, int do_loc
|
||||
if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
|
||||
flush_tlb_mask(d->domain_dirty_cpumask);
|
||||
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
|
||||
+ if ( !shadow_mode_external(d) && !is_pv_32on64_domain(d) )
|
||||
+ {
|
||||
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table[0]);
|
||||
+
|
||||
+ if ( !(v->arch.flags & TF_kernel_mode) &&
|
||||
+ VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
|
||||
+ zap_ro_mpt(mfn_x(smfn));
|
||||
+ else if ( (v->arch.flags & TF_kernel_mode) &&
|
||||
+ !VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
|
||||
+ fill_ro_mpt(mfn_x(smfn));
|
||||
+ }
|
||||
#else
|
||||
#error This should never happen
|
||||
#endif
|
||||
--- a/xen/arch/x86/x86_64/mm.c
|
||||
+++ b/xen/arch/x86/x86_64/mm.c
|
||||
@@ -480,7 +480,7 @@ static int setup_m2p_table(struct mem_ho
|
||||
l2_ro_mpt += l2_table_offset(va);
|
||||
}
|
||||
|
||||
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
|
||||
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
|
||||
l2e_write(l2_ro_mpt, l2e_from_pfn(mfn,
|
||||
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
|
||||
}
|
||||
@@ -583,7 +583,7 @@ void __init paging_init(void)
|
||||
0x77, 1UL << L3_PAGETABLE_SHIFT);
|
||||
|
||||
ASSERT(!l2_table_offset(va));
|
||||
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
|
||||
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
|
||||
l3e_write(&l3_ro_mpt[l3_table_offset(va)],
|
||||
l3e_from_page(l1_pg,
|
||||
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
|
||||
@@ -621,7 +621,7 @@ void __init paging_init(void)
|
||||
l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
|
||||
ASSERT(!l2_table_offset(va));
|
||||
}
|
||||
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
|
||||
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
|
||||
if ( l1_pg )
|
||||
l2e_write(l2_ro_mpt, l2e_from_page(
|
||||
l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
|
||||
--- a/xen/include/asm-x86/config.h
|
||||
+++ b/xen/include/asm-x86/config.h
|
||||
@@ -330,7 +330,8 @@ extern unsigned long xen_phys_start;
|
||||
#define NATIVE_VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments) | \
|
||||
(1UL << VMASST_TYPE_4gb_segments_notify) | \
|
||||
(1UL << VMASST_TYPE_writable_pagetables) | \
|
||||
- (1UL << VMASST_TYPE_pae_extended_cr3))
|
||||
+ (1UL << VMASST_TYPE_pae_extended_cr3) | \
|
||||
+ (1UL << VMASST_TYPE_m2p_strict))
|
||||
#define VM_ASSIST_VALID NATIVE_VM_ASSIST_VALID
|
||||
#define COMPAT_VM_ASSIST_VALID (NATIVE_VM_ASSIST_VALID & \
|
||||
((1UL << COMPAT_BITS_PER_LONG) - 1))
|
||||
--- a/xen/include/asm-x86/mm.h
|
||||
+++ b/xen/include/asm-x86/mm.h
|
||||
@@ -314,7 +314,10 @@ static inline void *__page_to_virt(const
|
||||
int free_page_type(struct page_info *page, unsigned long type,
|
||||
int preemptible);
|
||||
|
||||
-void init_guest_l4_table(l4_pgentry_t[], const struct domain *);
|
||||
+void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
|
||||
+ bool_t zap_ro_mpt);
|
||||
+void fill_ro_mpt(unsigned long mfn);
|
||||
+void zap_ro_mpt(unsigned long mfn);
|
||||
|
||||
int is_iomem_page(unsigned long mfn);
|
||||
|
||||
--- a/xen/include/public/xen.h
|
||||
+++ b/xen/include/public/xen.h
|
||||
@@ -486,6 +486,18 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
|
||||
/* x86/PAE guests: support PDPTs above 4GB. */
|
||||
#define VMASST_TYPE_pae_extended_cr3 3
|
||||
|
||||
+/*
|
||||
+ * x86/64 guests: strictly hide M2P from user mode.
|
||||
+ * This allows the guest to control respective hypervisor behavior:
|
||||
+ * - when not set, L4 tables get created with the respective slot blank,
|
||||
+ * and whenever the L4 table gets used as a kernel one the missing
|
||||
+ * mapping gets inserted,
|
||||
+ * - when set, L4 tables get created with the respective slot initialized
|
||||
+ * as before, and whenever the L4 table gets used as a user one the
|
||||
+ * mapping gets zapped.
|
||||
+ */
|
||||
+#define VMASST_TYPE_m2p_strict 32
|
||||
+
|
||||
#if __XEN_INTERFACE_VERSION__ < 0x00040600
|
||||
#define MAX_VMASST_TYPE 3
|
||||
#endif
|
@ -1,68 +0,0 @@
# Commit 2bfc9fc52ce8485fa43e79bbdc32360c74e12fe8
# Date 2015-05-08 10:59:26 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: provide arch_fetch_and_add()

arch_fetch_and_add() atomically adds a value and returns the previous
value.

This is needed to implement ticket locks.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>

--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -118,6 +118,52 @@ static always_inline unsigned long __cmp
 })
 
 /*
+ * Undefined symbol to cause link failure if a wrong size is used with
+ * arch_fetch_and_add().
+ */
+extern unsigned long __bad_fetch_and_add_size(void);
+
+static always_inline unsigned long __xadd(
+    volatile void *ptr, unsigned long v, int size)
+{
+    switch ( size )
+    {
+    case 1:
+        asm volatile ( "lock; xaddb %b0,%1"
+                       : "+r" (v), "+m" (*__xg(ptr))
+                       :: "memory");
+        return v;
+    case 2:
+        asm volatile ( "lock; xaddw %w0,%1"
+                       : "+r" (v), "+m" (*__xg(ptr))
+                       :: "memory");
+        return v;
+    case 4:
+        asm volatile ( "lock; xaddl %k0,%1"
+                       : "+r" (v), "+m" (*__xg(ptr))
+                       :: "memory");
+        return v;
+    case 8:
+        asm volatile ( "lock; xaddq %q0,%1"
+                       : "+r" (v), "+m" (*__xg(ptr))
+                       :: "memory");
+
+        return v;
+    default:
+        return __bad_fetch_and_add_size();
+    }
+}
+
+/*
+ * Atomically add @v to the 1, 2, 4, or 8 byte value at @ptr.  Returns
+ * the previous value.
+ *
+ * This is a full memory barrier.
+ */
+#define arch_fetch_and_add(ptr, v) \
+    ((typeof(*(ptr)))__xadd(ptr, (typeof(*(ptr)))(v), sizeof(*(ptr))))
+
+/*
 * Both Intel and AMD agree that, from a programmer's viewpoint:
 * Loads cannot be reordered relative to other loads.
 * Stores cannot be reordered relative to other stores.
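Illustration (not part of the patch above): a rough sketch of how a fetch-and-add primitive yields a ticket lock, the use case named in the commit message. The GCC __atomic_fetch_add() builtin stands in for arch_fetch_and_add(); the barriers, preemption handling and lock profiling of the real Xen code are omitted.

#include <stdint.h>

typedef struct {
    volatile uint16_t head;     /* ticket currently being served */
    volatile uint16_t tail;     /* next ticket to hand out */
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
    /* Atomically take the next ticket; the old tail value is "my" ticket. */
    uint16_t mine = __atomic_fetch_add(&l->tail, 1, __ATOMIC_ACQUIRE);

    while ( l->head != mine )
        ;                       /* spin; real code would use cpu_relax() */
}

static void ticket_unlock(ticket_lock_t *l)
{
    /* Serve the next waiter by advancing head. */
    __atomic_fetch_add(&l->head, 1, __ATOMIC_RELEASE);
}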
@ -1,29 +0,0 @@
# Commit f9cc3cd9b4de58cf032c8624406384c172937e57
# Date 2015-05-08 10:59:44 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: provide arch_fetch_and_add()

arch_fetch_and_add() atomically adds a value and returns the previous
value.

This generic arm implementation uses the GCC __sync_fetch_and_add()
builtin.  This builtin resulted in suitable inlined asm for GCC 4.8.3
(arm64) and GCC 4.6.3 (arm32).

This is needed to implement ticket locks.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/include/asm-arm/system.h
+++ b/xen/include/asm-arm/system.h
@@ -51,6 +51,8 @@
 # error "unknown ARM variant"
 #endif
 
+#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)
+
 extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);
 
 #endif
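Illustration (not part of the patch above): __sync_fetch_and_add() returns the value the location held before the addition, which is exactly what taking a ticket needs:

#include <assert.h>

int main(void)
{
    unsigned int counter = 5;
    unsigned int old = __sync_fetch_and_add(&counter, 3);

    assert(old == 5 && counter == 8);
    return 0;
}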
@ -1,155 +0,0 @@
|
||||
commit 161212ef02312c0681d2d809c8ff1e1f0ea6f6f9
|
||||
Author: Fabio Fantoni <fabio.fantoni@m2r.biz>
|
||||
Date: Wed Apr 29 11:20:28 2015 +0200
|
||||
|
||||
libxl: Add qxl vga interface support for upstream qemu
|
||||
|
||||
Usage:
|
||||
vga="qxl"
|
||||
|
||||
Qxl vga support many resolutions that not supported by stdvga,
|
||||
mainly the 16:9 ones and other high up to 2560x1600.
|
||||
With QXL you can get improved performance and smooth video also
|
||||
with high resolutions and high quality.
|
||||
Require their drivers installed in the domU and spice used
|
||||
otherwise act as a simple stdvga.
|
||||
|
||||
Signed-off-by: Fabio Fantoni <fabio.fantoni@m2r.biz>
|
||||
Signed-off-by: Zhou Peng <zpengxen@gmail.com>
|
||||
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
|
||||
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
|
||||
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
|
||||
|
||||
Index: xen-4.5.1-testing/docs/man/xl.cfg.pod.5
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/docs/man/xl.cfg.pod.5
|
||||
+++ xen-4.5.1-testing/docs/man/xl.cfg.pod.5
|
||||
@@ -1292,6 +1292,9 @@ qemu-xen-traditional device-model, the a
|
||||
which is sufficient for 1024x768 at 32 bpp. For the upstream qemu-xen
|
||||
device-model, the default and minimum is 8 MB.
|
||||
|
||||
+For B<qxl> vga, the default is both default and minimal 128MB.
|
||||
+If B<videoram> is set less than 128MB, an error will be triggered.
|
||||
+
|
||||
=item B<stdvga=BOOLEAN>
|
||||
|
||||
Select a standard VGA card with VBE (VESA BIOS Extensions) as the
|
||||
@@ -1303,9 +1306,14 @@ This option is deprecated, use vga="stdv
|
||||
|
||||
=item B<vga="STRING">
|
||||
|
||||
-Selects the emulated video card (none|stdvga|cirrus).
|
||||
+Selects the emulated video card (none|stdvga|cirrus|qxl).
|
||||
The default is cirrus.
|
||||
|
||||
+In general, QXL should work with the Spice remote display protocol
|
||||
+for acceleration, and QXL driver is necessary in guest in this case.
|
||||
+QXL can also work with the VNC protocol, but it will be like a standard
|
||||
+VGA without acceleration.
|
||||
+
|
||||
=item B<vnc=BOOLEAN>
|
||||
|
||||
Allow access to the display via the VNC protocol. This enables the
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl.h
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl.h
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl.h
|
||||
@@ -506,6 +506,16 @@ typedef struct libxl__ctx libxl_ctx;
|
||||
#define LIBXL_HAVE_DOMINFO_OUTSTANDING_MEMKB 1
|
||||
|
||||
/*
|
||||
+ * LIBXL_HAVE_QXL
|
||||
+ *
|
||||
+ * If defined, then the libxl_vga_interface_type will contain another value:
|
||||
+ * "QXL". This value define if qxl vga is supported.
|
||||
+ *
|
||||
+ * If this is not defined, the qxl vga support is missed.
|
||||
+ */
|
||||
+#define LIBXL_HAVE_QXL 1
|
||||
+
|
||||
+/*
|
||||
* LIBXL_HAVE_SPICE_VDAGENT
|
||||
*
|
||||
* If defined, then the libxl_spice_info structure will contain a boolean type:
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_create.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_create.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_create.c
|
||||
@@ -240,6 +240,10 @@ int libxl__domain_build_info_setdefault(
|
||||
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
|
||||
b_info->video_memkb = 0;
|
||||
break;
|
||||
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
|
||||
+ LOG(ERROR,"qemu upstream required for qxl vga");
|
||||
+ return ERROR_INVAL;
|
||||
+ break;
|
||||
case LIBXL_VGA_INTERFACE_TYPE_STD:
|
||||
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
|
||||
b_info->video_memkb = 8 * 1024;
|
||||
@@ -264,6 +268,15 @@ int libxl__domain_build_info_setdefault(
|
||||
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
|
||||
b_info->video_memkb = 0;
|
||||
break;
|
||||
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
|
||||
+ if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT) {
|
||||
+ b_info->video_memkb = (128 * 1024);
|
||||
+ } else if (b_info->video_memkb < (128 * 1024)) {
|
||||
+ LOG(ERROR,
|
||||
+ "128 Mib videoram is the minimum for qxl default");
|
||||
+ return ERROR_INVAL;
|
||||
+ }
|
||||
+ break;
|
||||
case LIBXL_VGA_INTERFACE_TYPE_STD:
|
||||
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
|
||||
b_info->video_memkb = 16 * 1024;
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_dm.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dm.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_dm.c
|
||||
@@ -251,6 +251,8 @@ static char ** libxl__build_device_model
|
||||
case LIBXL_VGA_INTERFACE_TYPE_NONE:
|
||||
flexarray_append_pair(dm_args, "-vga", "none");
|
||||
break;
|
||||
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
|
||||
+ break;
|
||||
}
|
||||
|
||||
if (b_info->u.hvm.boot) {
|
||||
@@ -616,6 +618,12 @@ static char ** libxl__build_device_model
|
||||
break;
|
||||
case LIBXL_VGA_INTERFACE_TYPE_NONE:
|
||||
break;
|
||||
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
|
||||
+ /* QXL have 2 ram regions, ram and vram */
|
||||
+ flexarray_append_pair(dm_args, "-device",
|
||||
+ GCSPRINTF("qxl-vga,vram_size_mb=%"PRIu64",ram_size_mb=%"PRIu64,
|
||||
+ (b_info->video_memkb/2/1024), (b_info->video_memkb/2/1024) ) );
|
||||
+ break;
|
||||
}
|
||||
|
||||
if (b_info->u.hvm.boot) {
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_types.idl
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_types.idl
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_types.idl
|
||||
@@ -181,6 +181,7 @@ libxl_vga_interface_type = Enumeration("
|
||||
(1, "CIRRUS"),
|
||||
(2, "STD"),
|
||||
(3, "NONE"),
|
||||
+ (4, "QXL"),
|
||||
], init_val = "LIBXL_VGA_INTERFACE_TYPE_CIRRUS")
|
||||
|
||||
libxl_vendor_device = Enumeration("vendor_device", [
|
||||
Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/xl_cmdimpl.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
|
||||
@@ -1910,6 +1910,8 @@ skip_vfb:
|
||||
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_CIRRUS;
|
||||
} else if (!strcmp(buf, "none")) {
|
||||
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_NONE;
|
||||
+ } else if (!strcmp(buf, "qxl")) {
|
||||
+ b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_QXL;
|
||||
} else {
|
||||
fprintf(stderr, "Unknown vga \"%s\" specified\n", buf);
|
||||
exit(1);
|
@ -1,65 +0,0 @@
# Commit 3c694aec08dda782d9c866e599b848dff86f474f
# Date 2015-05-13 15:00:58 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: provide add_sized()

add_sized(ptr, inc) adds inc to the value at ptr using only the correct
size of loads and stores for the type of *ptr.  The add is /not/ atomic.

This is needed for ticket locks to ensure the increment of the head ticket
does not affect the tail ticket.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>

--- a/xen/include/asm-x86/atomic.h
+++ b/xen/include/asm-x86/atomic.h
@@ -14,6 +14,14 @@ static inline void name(volatile type *a
 { asm volatile("mov" size " %1,%0": "=m" (*(volatile type *)addr) \
               :reg (val) barrier); }
 
+#define build_add_sized(name, size, type, reg) \
+    static inline void name(volatile type *addr, type val)  \
+    {                                                       \
+        asm volatile("add" size " %1,%0"                    \
+                     : "=m" (*addr)                         \
+                     : reg (val));                          \
+    }
+
 build_read_atomic(read_u8_atomic, "b", uint8_t, "=q", )
 build_read_atomic(read_u16_atomic, "w", uint16_t, "=r", )
 build_read_atomic(read_u32_atomic, "l", uint32_t, "=r", )
@@ -25,8 +33,14 @@ build_write_atomic(write_u32_atomic, "l"
 build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", )
 build_write_atomic(write_u64_atomic, "q", uint64_t, "r", )
 
+build_add_sized(add_u8_sized, "b", uint8_t, "qi")
+build_add_sized(add_u16_sized, "w", uint16_t, "ri")
+build_add_sized(add_u32_sized, "l", uint32_t, "ri")
+build_add_sized(add_u64_sized, "q", uint64_t, "ri")
+
 #undef build_read_atomic
 #undef build_write_atomic
+#undef build_add_sized
 
 void __bad_atomic_size(void);
 
@@ -54,6 +68,18 @@ void __bad_atomic_size(void);
     __x;                                                  \
 })
 
+#define add_sized(p, x) ({                                \
+    typeof(*(p)) x_ = (x);                                \
+    switch ( sizeof(*(p)) )                               \
+    {                                                     \
+    case 1: add_u8_sized((uint8_t *)(p), x_); break;      \
+    case 2: add_u16_sized((uint16_t *)(p), x_); break;    \
+    case 4: add_u32_sized((uint32_t *)(p), x_); break;    \
+    case 8: add_u64_sized((uint64_t *)(p), x_); break;    \
+    default: __bad_atomic_size(); break;                  \
+    }                                                     \
+})
+
 /*
 * NB. I've pushed the volatile qualifier into the operations. This allows
 * fast accessors such as _atomic_read() and _atomic_set() which don't give
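Illustration (not part of the patch above): the point of a sized but non-atomic add is that releasing the lock touches only the 16-bit head, so it can never clobber a concurrent fetch-and-add on the adjacent tail, which a 32-bit read-modify-write of the whole word could. A rough standalone sketch:

#include <stdint.h>

typedef union {
    uint32_t head_tail;
    struct {
        uint16_t head;
        uint16_t tail;
    };
} tickets_t;

static void release(tickets_t *t)
{
    /* add_sized(&t->head, 1) boils down to a single 16-bit add such as
     * this (the real macro emits an "addw" via inline asm): */
    *(volatile uint16_t *)&t->head += 1;
}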
@ -1,64 +0,0 @@
|
||||
# Commit 890674d13feb4a270aa112ca452dcf62fdd53f34
|
||||
# Date 2015-05-13 15:01:25 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
arm: provide add_sized()
|
||||
|
||||
add_sized(ptr, inc) adds inc to the value at ptr using only the correct
|
||||
size of loads and stores for the type of *ptr. The add is /not/ atomic.
|
||||
|
||||
This is needed for ticket locks to ensure the increment of the head ticket
|
||||
does not affect the tail ticket.
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
--- a/xen/include/asm-arm/atomic.h
|
||||
+++ b/xen/include/asm-arm/atomic.h
|
||||
@@ -23,6 +23,17 @@ static inline void name(volatile type *a
|
||||
: reg (val)); \
|
||||
}
|
||||
|
||||
+#define build_add_sized(name, size, width, type, reg) \
|
||||
+static inline void name(volatile type *addr, type val) \
|
||||
+{ \
|
||||
+ type t; \
|
||||
+ asm volatile("ldr" size " %"width"1,%0\n" \
|
||||
+ "add %"width"1,%"width"1,%"width"2\n" \
|
||||
+ "str" size " %"width"1,%0" \
|
||||
+ : "=m" (*(volatile type *)addr), "=r" (t) \
|
||||
+ : reg (val)); \
|
||||
+}
|
||||
+
|
||||
#if defined (CONFIG_ARM_32)
|
||||
#define BYTE ""
|
||||
#define WORD ""
|
||||
@@ -46,6 +57,10 @@ build_atomic_read(read_u64_atomic, "x",
|
||||
build_atomic_write(write_u64_atomic, "x", uint64_t, "r")
|
||||
#endif
|
||||
|
||||
+build_add_sized(add_u8_sized, "b", BYTE, uint8_t, "ri")
|
||||
+build_add_sized(add_u16_sized, "h", WORD, uint16_t, "ri")
|
||||
+build_add_sized(add_u32_sized, "", WORD, uint32_t, "ri")
|
||||
+
|
||||
void __bad_atomic_size(void);
|
||||
|
||||
#define read_atomic(p) ({ \
|
||||
@@ -70,6 +85,17 @@ void __bad_atomic_size(void);
|
||||
__x; \
|
||||
})
|
||||
|
||||
+#define add_sized(p, x) ({ \
|
||||
+ typeof(*(p)) __x = (x); \
|
||||
+ switch ( sizeof(*(p)) ) \
|
||||
+ { \
|
||||
+ case 1: add_u8_sized((uint8_t *)(p), __x); break; \
|
||||
+ case 2: add_u16_sized((uint16_t *)(p), __x); break; \
|
||||
+ case 4: add_u32_sized((uint32_t *)(p), __x); break; \
|
||||
+ default: __bad_atomic_size(); break; \
|
||||
+ } \
|
||||
+})
|
||||
+
|
||||
/*
|
||||
* NB. I've pushed the volatile qualifier into the operations. This allows
|
||||
* fast accessors such as _atomic_read() and _atomic_set() which don't give
|
@ -1,305 +0,0 @@
|
||||
# Commit 45fcc4568c5162b00fb3907fb158af82dd484a3d
|
||||
# Date 2015-05-15 09:49:12 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
use ticket locks for spin locks
|
||||
|
||||
Replace the byte locks with ticket locks. Ticket locks are: a) fair;
|
||||
and b) peform better when contented since they spin without an atomic
|
||||
operation.
|
||||
|
||||
The lock is split into two ticket values: head and tail. A locker
|
||||
acquires a ticket by (atomically) increasing tail and using the
|
||||
previous tail value. A CPU holds the lock if its ticket == head. The
|
||||
lock is released by increasing head.
|
||||
|
||||
spin_lock_irq() and spin_lock_irqsave() now spin with irqs disabled
|
||||
(previously, they would spin with irqs enabled if possible). This is
|
||||
required to prevent deadlocks when the irq handler tries to take the
|
||||
same lock with a higher ticket.
|
||||
|
||||
Architectures need only provide arch_fetch_and_add() and two barriers:
|
||||
arch_lock_acquire_barrier() and arch_lock_release_barrier().
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/spinlock.c
|
||||
+++ b/xen/common/spinlock.c
|
||||
@@ -115,125 +115,134 @@ void spin_debug_disable(void)
|
||||
|
||||
#endif
|
||||
|
||||
+static always_inline spinlock_tickets_t observe_lock(spinlock_tickets_t *t)
|
||||
+{
|
||||
+ spinlock_tickets_t v;
|
||||
+
|
||||
+ smp_rmb();
|
||||
+ v.head_tail = read_atomic(&t->head_tail);
|
||||
+ return v;
|
||||
+}
|
||||
+
|
||||
+static always_inline u16 observe_head(spinlock_tickets_t *t)
|
||||
+{
|
||||
+ smp_rmb();
|
||||
+ return read_atomic(&t->head);
|
||||
+}
|
||||
+
|
||||
void _spin_lock(spinlock_t *lock)
|
||||
{
|
||||
+ spinlock_tickets_t tickets = SPINLOCK_TICKET_INC;
|
||||
LOCK_PROFILE_VAR;
|
||||
|
||||
check_lock(&lock->debug);
|
||||
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
|
||||
+ tickets.head_tail = arch_fetch_and_add(&lock->tickets.head_tail,
|
||||
+ tickets.head_tail);
|
||||
+ while ( tickets.tail != observe_head(&lock->tickets) )
|
||||
{
|
||||
LOCK_PROFILE_BLOCK;
|
||||
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
|
||||
- cpu_relax();
|
||||
+ cpu_relax();
|
||||
}
|
||||
LOCK_PROFILE_GOT;
|
||||
preempt_disable();
|
||||
+ arch_lock_acquire_barrier();
|
||||
}
|
||||
|
||||
void _spin_lock_irq(spinlock_t *lock)
|
||||
{
|
||||
- LOCK_PROFILE_VAR;
|
||||
-
|
||||
ASSERT(local_irq_is_enabled());
|
||||
local_irq_disable();
|
||||
- check_lock(&lock->debug);
|
||||
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
|
||||
- {
|
||||
- LOCK_PROFILE_BLOCK;
|
||||
- local_irq_enable();
|
||||
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
|
||||
- cpu_relax();
|
||||
- local_irq_disable();
|
||||
- }
|
||||
- LOCK_PROFILE_GOT;
|
||||
- preempt_disable();
|
||||
+ _spin_lock(lock);
|
||||
}
|
||||
|
||||
unsigned long _spin_lock_irqsave(spinlock_t *lock)
|
||||
{
|
||||
unsigned long flags;
|
||||
- LOCK_PROFILE_VAR;
|
||||
|
||||
local_irq_save(flags);
|
||||
- check_lock(&lock->debug);
|
||||
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
|
||||
- {
|
||||
- LOCK_PROFILE_BLOCK;
|
||||
- local_irq_restore(flags);
|
||||
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
|
||||
- cpu_relax();
|
||||
- local_irq_save(flags);
|
||||
- }
|
||||
- LOCK_PROFILE_GOT;
|
||||
- preempt_disable();
|
||||
+ _spin_lock(lock);
|
||||
return flags;
|
||||
}
|
||||
|
||||
void _spin_unlock(spinlock_t *lock)
|
||||
{
|
||||
+ arch_lock_release_barrier();
|
||||
preempt_enable();
|
||||
LOCK_PROFILE_REL;
|
||||
- _raw_spin_unlock(&lock->raw);
|
||||
+ add_sized(&lock->tickets.head, 1);
|
||||
}
|
||||
|
||||
void _spin_unlock_irq(spinlock_t *lock)
|
||||
{
|
||||
- preempt_enable();
|
||||
- LOCK_PROFILE_REL;
|
||||
- _raw_spin_unlock(&lock->raw);
|
||||
+ _spin_unlock(lock);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
|
||||
{
|
||||
- preempt_enable();
|
||||
- LOCK_PROFILE_REL;
|
||||
- _raw_spin_unlock(&lock->raw);
|
||||
+ _spin_unlock(lock);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
int _spin_is_locked(spinlock_t *lock)
|
||||
{
|
||||
check_lock(&lock->debug);
|
||||
- return _raw_spin_is_locked(&lock->raw);
|
||||
+ return lock->tickets.head != lock->tickets.tail;
|
||||
}
|
||||
|
||||
int _spin_trylock(spinlock_t *lock)
|
||||
{
|
||||
+ spinlock_tickets_t old, new;
|
||||
+
|
||||
check_lock(&lock->debug);
|
||||
- if ( !_raw_spin_trylock(&lock->raw) )
|
||||
+ old = observe_lock(&lock->tickets);
|
||||
+ if ( old.head != old.tail )
|
||||
+ return 0;
|
||||
+ new = old;
|
||||
+ new.tail++;
|
||||
+ if ( cmpxchg(&lock->tickets.head_tail,
|
||||
+ old.head_tail, new.head_tail) != old.head_tail )
|
||||
return 0;
|
||||
#ifdef LOCK_PROFILE
|
||||
if (lock->profile)
|
||||
lock->profile->time_locked = NOW();
|
||||
#endif
|
||||
preempt_disable();
|
||||
+ /*
|
||||
+ * cmpxchg() is a full barrier so no need for an
|
||||
+ * arch_lock_acquire_barrier().
|
||||
+ */
|
||||
return 1;
|
||||
}
|
||||
|
||||
void _spin_barrier(spinlock_t *lock)
|
||||
{
|
||||
+ spinlock_tickets_t sample;
|
||||
#ifdef LOCK_PROFILE
|
||||
s_time_t block = NOW();
|
||||
- u64 loop = 0;
|
||||
+#endif
|
||||
|
||||
check_barrier(&lock->debug);
|
||||
- do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) );
|
||||
- if ((loop > 1) && lock->profile)
|
||||
+ smp_mb();
|
||||
+ sample = observe_lock(&lock->tickets);
|
||||
+ if ( sample.head != sample.tail )
|
||||
{
|
||||
- lock->profile->time_block += NOW() - block;
|
||||
- lock->profile->block_cnt++;
|
||||
- }
|
||||
-#else
|
||||
- check_barrier(&lock->debug);
|
||||
- do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) );
|
||||
+ while ( observe_head(&lock->tickets) == sample.head )
|
||||
+ cpu_relax();
|
||||
+#ifdef LOCK_PROFILE
|
||||
+ if ( lock->profile )
|
||||
+ {
|
||||
+ lock->profile->time_block += NOW() - block;
|
||||
+ lock->profile->block_cnt++;
|
||||
+ }
|
||||
#endif
|
||||
+ }
|
||||
smp_mb();
|
||||
}
|
||||
|
||||
int _spin_trylock_recursive(spinlock_t *lock)
|
||||
{
|
||||
- int cpu = smp_processor_id();
|
||||
+ unsigned int cpu = smp_processor_id();
|
||||
|
||||
/* Don't allow overflow of recurse_cpu field. */
|
||||
BUILD_BUG_ON(NR_CPUS > 0xfffu);
|
||||
@@ -256,8 +265,17 @@ int _spin_trylock_recursive(spinlock_t *
|
||||
|
||||
void _spin_lock_recursive(spinlock_t *lock)
|
||||
{
|
||||
- while ( !spin_trylock_recursive(lock) )
|
||||
- cpu_relax();
|
||||
+ unsigned int cpu = smp_processor_id();
|
||||
+
|
||||
+ if ( likely(lock->recurse_cpu != cpu) )
|
||||
+ {
|
||||
+ _spin_lock(lock);
|
||||
+ lock->recurse_cpu = cpu;
|
||||
+ }
|
||||
+
|
||||
+ /* We support only fairly shallow recursion, else the counter overflows. */
|
||||
+ ASSERT(lock->recurse_cnt < 0xfu);
|
||||
+ lock->recurse_cnt++;
|
||||
}
|
||||
|
||||
void _spin_unlock_recursive(spinlock_t *lock)
|
||||
--- a/xen/include/asm-arm/system.h
|
||||
+++ b/xen/include/asm-arm/system.h
|
||||
@@ -53,6 +53,9 @@
|
||||
|
||||
#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)
|
||||
|
||||
+#define arch_lock_acquire_barrier() smp_mb()
|
||||
+#define arch_lock_release_barrier() smp_mb()
|
||||
+
|
||||
extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);
|
||||
|
||||
#endif
|
||||
--- a/xen/include/asm-x86/system.h
|
||||
+++ b/xen/include/asm-x86/system.h
|
||||
@@ -185,6 +185,17 @@ static always_inline unsigned long __xad
|
||||
#define set_mb(var, value) do { xchg(&var, value); } while (0)
|
||||
#define set_wmb(var, value) do { var = value; wmb(); } while (0)
|
||||
|
||||
+/*
|
||||
+ * On x86 the only reordering is of reads with older writes. In the
|
||||
+ * lock case, the read in observe_head() can only be reordered with
|
||||
+ * writes that precede it, and moving a write _into_ a locked section
|
||||
+ * is OK. In the release case, the write in add_sized() can only be
|
||||
+ * reordered with reads that follow it, and hoisting a read _into_ a
|
||||
+ * locked region is OK.
|
||||
+ */
|
||||
+#define arch_lock_acquire_barrier() barrier()
|
||||
+#define arch_lock_release_barrier() barrier()
|
||||
+
|
||||
#define local_irq_disable() asm volatile ( "cli" : : : "memory" )
|
||||
#define local_irq_enable() asm volatile ( "sti" : : : "memory" )
|
||||
|
||||
--- a/xen/include/xen/spinlock.h
|
||||
+++ b/xen/include/xen/spinlock.h
|
||||
@@ -80,8 +80,7 @@ struct lock_profile_qhead {
|
||||
static struct lock_profile *__lock_profile_##name \
|
||||
__used_section(".lockprofile.data") = \
|
||||
&__lock_profile_data_##name
|
||||
-#define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, \
|
||||
- _LOCK_DEBUG, x }
|
||||
+#define _SPIN_LOCK_UNLOCKED(x) { { 0 }, 0xfffu, 0, _LOCK_DEBUG, x }
|
||||
#define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL)
|
||||
#define DEFINE_SPINLOCK(l) \
|
||||
spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL); \
|
||||
@@ -117,8 +116,7 @@ extern void spinlock_profile_reset(unsig
|
||||
|
||||
struct lock_profile_qhead { };
|
||||
|
||||
-#define SPIN_LOCK_UNLOCKED \
|
||||
- { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
|
||||
+#define SPIN_LOCK_UNLOCKED { { 0 }, 0xfffu, 0, _LOCK_DEBUG }
|
||||
#define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
|
||||
|
||||
#define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l))
|
||||
@@ -127,8 +125,18 @@ struct lock_profile_qhead { };
|
||||
|
||||
#endif
|
||||
|
||||
+typedef union {
|
||||
+ u32 head_tail;
|
||||
+ struct {
|
||||
+ u16 head;
|
||||
+ u16 tail;
|
||||
+ };
|
||||
+} spinlock_tickets_t;
|
||||
+
|
||||
+#define SPINLOCK_TICKET_INC { .head_tail = 0x10000, }
|
||||
+
|
||||
typedef struct spinlock {
|
||||
- raw_spinlock_t raw;
|
||||
+ spinlock_tickets_t tickets;
|
||||
u16 recurse_cpu:12;
|
||||
u16 recurse_cnt:4;
|
||||
struct lock_debug debug;
|
@ -1,266 +0,0 @@
|
||||
# Commit e62e49e6d5d4e8d22f3df0b75443ede65a812435
|
||||
# Date 2015-05-15 09:52:25 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86,arm: remove asm/spinlock.h from all architectures
|
||||
|
||||
Now that all architecture use a common ticket lock implementation for
|
||||
spinlocks, remove the architecture specific byte lock implementations.
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
Acked-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
--- a/xen/arch/arm/README.LinuxPrimitives
|
||||
+++ b/xen/arch/arm/README.LinuxPrimitives
|
||||
@@ -25,16 +25,6 @@ linux/arch/arm64/include/asm/atomic.h
|
||||
|
||||
---------------------------------------------------------------------
|
||||
|
||||
-spinlocks: last sync @ v3.16-rc6 (last commit: 95c4189689f9)
|
||||
-
|
||||
-linux/arch/arm64/include/asm/spinlock.h xen/include/asm-arm/arm64/spinlock.h
|
||||
-
|
||||
-Skipped:
|
||||
- 5686b06 arm64: lockref: add support for lockless lockrefs using cmpxchg
|
||||
- 52ea2a5 arm64: locks: introduce ticket-based spinlock implementation
|
||||
-
|
||||
----------------------------------------------------------------------
|
||||
-
|
||||
mem*: last sync @ v3.16-rc6 (last commit: d875c9b37240)
|
||||
|
||||
linux/arch/arm64/lib/memchr.S xen/arch/arm/arm64/lib/memchr.S
|
||||
@@ -103,24 +93,6 @@ linux/arch/arm/include/asm/atomic.h
|
||||
|
||||
---------------------------------------------------------------------
|
||||
|
||||
-spinlocks: last sync: 15e7e5c1ebf5
|
||||
-
|
||||
-linux/arch/arm/include/asm/spinlock.h xen/include/asm-arm/arm32/spinlock.h
|
||||
-
|
||||
-*** Linux has switched to ticket locks but we still use bitlocks.
|
||||
-
|
||||
-resync to v3.14-rc7:
|
||||
-
|
||||
- 7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev
|
||||
- 0cbad9c ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64
|
||||
- 9bb17be ARM: locks: prefetch the destination word for write prior to strex
|
||||
- 27a8479 ARM: smp_on_up: move inline asm ALT_SMP patching macro out of spinlock.
|
||||
- 00efaa0 ARM: 7812/1: rwlocks: retry trylock operation if strex fails on free lo
|
||||
- afa31d8 ARM: 7811/1: locks: use early clobber in arch_spin_trylock
|
||||
- 73a6fdc ARM: spinlock: use inner-shareable dsb variant prior to sev instruction
|
||||
-
|
||||
----------------------------------------------------------------------
|
||||
-
|
||||
mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
|
||||
|
||||
linux/arch/arm/lib/copy_template.S xen/arch/arm/arm32/lib/copy_template.S
|
||||
--- a/xen/include/asm-arm/arm32/spinlock.h
|
||||
+++ /dev/null
|
||||
@@ -1,66 +0,0 @@
|
||||
-#ifndef __ASM_ARM32_SPINLOCK_H
|
||||
-#define __ASM_ARM32_SPINLOCK_H
|
||||
-
|
||||
-static inline void dsb_sev(void)
|
||||
-{
|
||||
- __asm__ __volatile__ (
|
||||
- "dsb\n"
|
||||
- "sev\n"
|
||||
- );
|
||||
-}
|
||||
-
|
||||
-typedef struct {
|
||||
- volatile unsigned int lock;
|
||||
-} raw_spinlock_t;
|
||||
-
|
||||
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
|
||||
-
|
||||
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
|
||||
-
|
||||
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- ASSERT(_raw_spin_is_locked(lock));
|
||||
-
|
||||
- smp_mb();
|
||||
-
|
||||
- __asm__ __volatile__(
|
||||
-" str %1, [%0]\n"
|
||||
- :
|
||||
- : "r" (&lock->lock), "r" (0)
|
||||
- : "cc");
|
||||
-
|
||||
- dsb_sev();
|
||||
-}
|
||||
-
|
||||
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- unsigned long contended, res;
|
||||
-
|
||||
- do {
|
||||
- __asm__ __volatile__(
|
||||
- " ldrex %0, [%2]\n"
|
||||
- " teq %0, #0\n"
|
||||
- " strexeq %1, %3, [%2]\n"
|
||||
- " movne %1, #0\n"
|
||||
- : "=&r" (contended), "=r" (res)
|
||||
- : "r" (&lock->lock), "r" (1)
|
||||
- : "cc");
|
||||
- } while (res);
|
||||
-
|
||||
- if (!contended) {
|
||||
- smp_mb();
|
||||
- return 1;
|
||||
- } else {
|
||||
- return 0;
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-#endif /* __ASM_SPINLOCK_H */
|
||||
-/*
|
||||
- * Local variables:
|
||||
- * mode: C
|
||||
- * c-file-style: "BSD"
|
||||
- * c-basic-offset: 4
|
||||
- * indent-tabs-mode: nil
|
||||
- * End:
|
||||
- */
|
||||
--- a/xen/include/asm-arm/arm64/spinlock.h
|
||||
+++ /dev/null
|
||||
@@ -1,63 +0,0 @@
|
||||
-/*
|
||||
- * Derived from Linux arch64 spinlock.h which is:
|
||||
- * Copyright (C) 2012 ARM Ltd.
|
||||
- *
|
||||
- * This program is free software; you can redistribute it and/or modify
|
||||
- * it under the terms of the GNU General Public License version 2 as
|
||||
- * published by the Free Software Foundation.
|
||||
- *
|
||||
- * This program is distributed in the hope that it will be useful,
|
||||
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
- * GNU General Public License for more details.
|
||||
- *
|
||||
- * You should have received a copy of the GNU General Public License
|
||||
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
- */
|
||||
-
|
||||
-#ifndef __ASM_ARM64_SPINLOCK_H
|
||||
-#define __ASM_ARM64_SPINLOCK_H
|
||||
-
|
||||
-typedef struct {
|
||||
- volatile unsigned int lock;
|
||||
-} raw_spinlock_t;
|
||||
-
|
||||
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
|
||||
-
|
||||
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
|
||||
-
|
||||
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- ASSERT(_raw_spin_is_locked(lock));
|
||||
-
|
||||
- asm volatile(
|
||||
- " stlr %w1, %0\n"
|
||||
- : "=Q" (lock->lock) : "r" (0) : "memory");
|
||||
-}
|
||||
-
|
||||
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- unsigned int tmp;
|
||||
-
|
||||
- asm volatile(
|
||||
- "2: ldaxr %w0, %1\n"
|
||||
- " cbnz %w0, 1f\n"
|
||||
- " stxr %w0, %w2, %1\n"
|
||||
- " cbnz %w0, 2b\n"
|
||||
- "1:\n"
|
||||
- : "=&r" (tmp), "+Q" (lock->lock)
|
||||
- : "r" (1)
|
||||
- : "cc", "memory");
|
||||
-
|
||||
- return !tmp;
|
||||
-}
|
||||
-
|
||||
-#endif /* __ASM_SPINLOCK_H */
|
||||
-/*
|
||||
- * Local variables:
|
||||
- * mode: C
|
||||
- * c-file-style: "BSD"
|
||||
- * c-basic-offset: 4
|
||||
- * indent-tabs-mode: nil
|
||||
- * End:
|
||||
- */
|
||||
--- a/xen/include/asm-arm/spinlock.h
|
||||
+++ /dev/null
|
||||
@@ -1,23 +0,0 @@
|
||||
-#ifndef __ASM_SPINLOCK_H
|
||||
-#define __ASM_SPINLOCK_H
|
||||
-
|
||||
-#include <xen/config.h>
|
||||
-#include <xen/lib.h>
|
||||
-
|
||||
-#if defined(CONFIG_ARM_32)
|
||||
-# include <asm/arm32/spinlock.h>
|
||||
-#elif defined(CONFIG_ARM_64)
|
||||
-# include <asm/arm64/spinlock.h>
|
||||
-#else
|
||||
-# error "unknown ARM variant"
|
||||
-#endif
|
||||
-
|
||||
-#endif /* __ASM_SPINLOCK_H */
|
||||
-/*
|
||||
- * Local variables:
|
||||
- * mode: C
|
||||
- * c-file-style: "BSD"
|
||||
- * c-basic-offset: 4
|
||||
- * indent-tabs-mode: nil
|
||||
- * End:
|
||||
- */
|
||||
--- a/xen/include/asm-x86/spinlock.h
|
||||
+++ /dev/null
|
||||
@@ -1,34 +0,0 @@
|
||||
-#ifndef __ASM_SPINLOCK_H
|
||||
-#define __ASM_SPINLOCK_H
|
||||
-
|
||||
-#include <xen/config.h>
|
||||
-#include <xen/lib.h>
|
||||
-#include <asm/atomic.h>
|
||||
-
|
||||
-typedef struct {
|
||||
- volatile s16 lock;
|
||||
-} raw_spinlock_t;
|
||||
-
|
||||
-#define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 }
|
||||
-
|
||||
-#define _raw_spin_is_locked(x) ((x)->lock <= 0)
|
||||
-
|
||||
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- ASSERT(_raw_spin_is_locked(lock));
|
||||
- asm volatile (
|
||||
- "movw $1,%0"
|
||||
- : "=m" (lock->lock) : : "memory" );
|
||||
-}
|
||||
-
|
||||
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
|
||||
-{
|
||||
- s16 oldval;
|
||||
- asm volatile (
|
||||
- "xchgw %w0,%1"
|
||||
- :"=r" (oldval), "=m" (lock->lock)
|
||||
- :"0" ((s16)0) : "memory" );
|
||||
- return (oldval > 0);
|
||||
-}
|
||||
-
|
||||
-#endif /* __ASM_SPINLOCK_H */
|
||||
--- a/xen/include/xen/spinlock.h
|
||||
+++ b/xen/include/xen/spinlock.h
|
||||
@@ -2,7 +2,6 @@
|
||||
#define __SPINLOCK_H__
|
||||
|
||||
#include <asm/system.h>
|
||||
-#include <asm/spinlock.h>
|
||||
|
||||
#ifndef NDEBUG
|
||||
struct lock_debug {
|
@ -1,141 +0,0 @@
|
||||
# Commit f278fcf19ce15f7b7ee69181560b5884a5e12b66
|
||||
# Date 2015-05-15 10:06:04 +0200
|
||||
# Author Roger Pau Monné <roger.pau@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
introduce a helper to allocate non-contiguous memory
|
||||
|
||||
The allocator uses independent calls to alloc_domheap_pages in order to get
|
||||
the desired amount of memory and then maps all the independent physical
|
||||
addresses into a contiguous virtual address space.
|
||||
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Tested-by: Julien Grall <julien.grall@citrix.com> (ARM)
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
# Commit 640f891eb258563bb155e577389e8c5e6541a59a
|
||||
# Date 2015-05-21 08:57:19 +0200
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
vmap: avoid hitting an ASSERT with vfree(NULL)
|
||||
|
||||
and unconditionally defer the vm_size() call, as it doesn't have a NULL
|
||||
short circuit.
|
||||
|
||||
Reported-by: Wei Liu <wei.liu2@citrix.com>
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Tested-by: Wei Liu <wei.liu2@citrix.com>
|
||||
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
--- a/xen/common/vmap.c
|
||||
+++ b/xen/common/vmap.c
|
||||
@@ -215,4 +215,75 @@ void vunmap(const void *va)
|
||||
#endif
|
||||
vm_free(va);
|
||||
}
|
||||
+
|
||||
+void *vmalloc(size_t size)
|
||||
+{
|
||||
+ unsigned long *mfn;
|
||||
+ size_t pages, i;
|
||||
+ struct page_info *pg;
|
||||
+ void *va;
|
||||
+
|
||||
+ ASSERT(size);
|
||||
+
|
||||
+ pages = PFN_UP(size);
|
||||
+ mfn = xmalloc_array(unsigned long, pages);
|
||||
+ if ( mfn == NULL )
|
||||
+ return NULL;
|
||||
+
|
||||
+ for ( i = 0; i < pages; i++ )
|
||||
+ {
|
||||
+ pg = alloc_domheap_page(NULL, 0);
|
||||
+ if ( pg == NULL )
|
||||
+ goto error;
|
||||
+ mfn[i] = page_to_mfn(pg);
|
||||
+ }
|
||||
+
|
||||
+ va = vmap(mfn, pages);
|
||||
+ if ( va == NULL )
|
||||
+ goto error;
|
||||
+
|
||||
+ xfree(mfn);
|
||||
+ return va;
|
||||
+
|
||||
+ error:
|
||||
+ while ( i-- )
|
||||
+ free_domheap_page(mfn_to_page(mfn[i]));
|
||||
+ xfree(mfn);
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+void *vzalloc(size_t size)
|
||||
+{
|
||||
+ void *p = vmalloc(size);
|
||||
+ int i;
|
||||
+
|
||||
+ if ( p == NULL )
|
||||
+ return NULL;
|
||||
+
|
||||
+ for ( i = 0; i < size; i += PAGE_SIZE )
|
||||
+ clear_page(p + i);
|
||||
+
|
||||
+ return p;
|
||||
+}
|
||||
+
|
||||
+void vfree(void *va)
|
||||
+{
|
||||
+ unsigned int i, pages;
|
||||
+ struct page_info *pg;
|
||||
+ PAGE_LIST_HEAD(pg_list);
|
||||
+
|
||||
+ if ( !va )
|
||||
+ return;
|
||||
+
|
||||
+ pages = vm_size(va);
|
||||
+ ASSERT(pages);
|
||||
+
|
||||
+ for ( i = 0; i < pages; i++ )
|
||||
+ page_list_add(vmap_to_page(va + i * PAGE_SIZE), &pg_list);
|
||||
+
|
||||
+ vunmap(va);
|
||||
+
|
||||
+ while ( (pg = page_list_remove_head(&pg_list)) != NULL )
|
||||
+ free_domheap_page(pg);
|
||||
+}
|
||||
#endif
|
||||
--- a/xen/include/asm-arm/mm.h
|
||||
+++ b/xen/include/asm-arm/mm.h
|
||||
@@ -208,6 +208,8 @@ static inline void __iomem *ioremap_wc(p
|
||||
#define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
|
||||
#define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT))
|
||||
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
|
||||
+#define vmap_to_mfn(va) paddr_to_pfn(virt_to_maddr((vaddr_t)va))
|
||||
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
|
||||
|
||||
/* Page-align address and convert to frame number format */
|
||||
#define paddr_to_pfn_aligned(paddr) paddr_to_pfn(PAGE_ALIGN(paddr))
|
||||
--- a/xen/include/asm-x86/page.h
|
||||
+++ b/xen/include/asm-x86/page.h
|
||||
@@ -262,6 +262,8 @@ void copy_page_sse2(void *, const void *
|
||||
#define pfn_to_paddr(pfn) __pfn_to_paddr(pfn)
|
||||
#define paddr_to_pfn(pa) __paddr_to_pfn(pa)
|
||||
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
|
||||
+#define vmap_to_mfn(va) l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))
|
||||
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
|
||||
|
||||
#endif /* !defined(__ASSEMBLY__) */
|
||||
|
||||
--- a/xen/include/xen/vmap.h
|
||||
+++ b/xen/include/xen/vmap.h
|
||||
@@ -11,6 +11,9 @@ void *__vmap(const unsigned long *mfn, u
|
||||
unsigned int nr, unsigned int align, unsigned int flags);
|
||||
void *vmap(const unsigned long *mfn, unsigned int nr);
|
||||
void vunmap(const void *);
|
||||
+void *vmalloc(size_t size);
|
||||
+void *vzalloc(size_t size);
|
||||
+void vfree(void *va);
|
||||
|
||||
void __iomem *ioremap(paddr_t, size_t);
|
||||
|
@ -1,29 +0,0 @@
# Commit fed56ba0e69b251d0222ef0785cd1c1838f9e51d
# Date 2015-06-02 13:45:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
unmodified-drivers: tolerate IRQF_DISABLED being undefined

It's being removed in Linux 4.1.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
@@ -350,11 +350,13 @@ int xen_irq_init(struct pci_dev *pdev)
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
			 SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT,
 #else
-			 IRQF_SHARED |
 #ifdef IRQF_SAMPLE_RANDOM
			 IRQF_SAMPLE_RANDOM |
 #endif
-			 IRQF_DISABLED,
+#ifdef IRQF_DISABLED
+			 IRQF_DISABLED |
+#endif
+			 IRQF_SHARED,
 #endif
			 "xen-platform-pci", pdev);
 }
@ -1,158 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit 85baced14dec2fafa9fe560969dba2ae28e8bebb
|
||||
# Date 2015-06-09 15:59:31 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86: adjust PV I/O emulation functions' types
|
||||
|
||||
admin_io_okay(), guest_io_read(), and guest_io_write() all don't need
|
||||
their current "regs" parameter at all, and they don't use the vCPU
|
||||
passed to them for other than obtaining its domain. Drop the former and
|
||||
replace the latter by a struct domain pointer.
|
||||
|
||||
pci_cfg_okay() returns a boolean type, and its "write" parameter is of
|
||||
boolean kind too.
|
||||
|
||||
All of them get called for the current vCPU (and hence current domain)
|
||||
only, so name the domain parameters accordingly except in the
|
||||
admin_io_okay() case, which a subsequent patch will use for simplifying
|
||||
setup_io_bitmap().
|
||||
|
||||
Latch current->domain into a local variable in emulate_privileged_op().
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
# Commit 2d67a7a4d37a4759bcd7f2ee2d740497ad669c7d
|
||||
# Date 2015-06-18 15:07:10 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86: synchronize PCI config space access decoding
|
||||
|
||||
Both PV and HVM logic have similar but not similar enough code here.
|
||||
Synchronize the two so that
|
||||
- in the HVM case we don't unconditionally try to access extended
|
||||
config space
|
||||
- in the PV case we pass a correct range to the XSM hook
|
||||
- in the PV case we don't needlessly deny access when the operation
|
||||
isn't really on PCI config space
|
||||
All this along with sharing the macros HVM already had here.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
Backport stripped down to just the pci_cfg_ok() adjustments.
|
||||
|
||||
--- a/xen/arch/x86/traps.c
|
||||
+++ b/xen/arch/x86/traps.c
|
||||
@@ -1708,14 +1708,18 @@ static int admin_io_okay(
|
||||
return ioports_access_permitted(v->domain, port, port + bytes - 1);
|
||||
}
|
||||
|
||||
-static int pci_cfg_ok(struct domain *d, int write, int size)
|
||||
+static bool_t pci_cfg_ok(struct domain *currd, bool_t write,
|
||||
+ unsigned int start, unsigned int size)
|
||||
{
|
||||
uint32_t machine_bdf;
|
||||
- uint16_t start, end;
|
||||
- if (!is_hardware_domain(d))
|
||||
+
|
||||
+ if ( !is_hardware_domain(currd) )
|
||||
return 0;
|
||||
|
||||
- machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
|
||||
+ if ( !CF8_ENABLED(currd->arch.pci_cf8) )
|
||||
+ return 1;
|
||||
+
|
||||
+ machine_bdf = CF8_BDF(currd->arch.pci_cf8);
|
||||
if ( write )
|
||||
{
|
||||
const unsigned long *ro_map = pci_get_ro_map(0);
|
||||
@@ -1723,9 +1727,9 @@ static int pci_cfg_ok(struct domain *d,
|
||||
if ( ro_map && test_bit(machine_bdf, ro_map) )
|
||||
return 0;
|
||||
}
|
||||
- start = d->arch.pci_cf8 & 0xFF;
|
||||
+ start |= CF8_ADDR_LO(currd->arch.pci_cf8);
|
||||
/* AMD extended configuration space access? */
|
||||
- if ( (d->arch.pci_cf8 & 0x0F000000) &&
|
||||
+ if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
|
||||
boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
|
||||
{
|
||||
@@ -1734,12 +1738,11 @@ static int pci_cfg_ok(struct domain *d,
|
||||
if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
|
||||
return 0;
|
||||
if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
|
||||
- start |= (d->arch.pci_cf8 >> 16) & 0xF00;
|
||||
+ start |= CF8_ADDR_HI(currd->arch.pci_cf8);
|
||||
}
|
||||
- end = start + size - 1;
|
||||
- if (xsm_pci_config_permission(XSM_HOOK, d, machine_bdf, start, end, write))
|
||||
- return 0;
|
||||
- return 1;
|
||||
+
|
||||
+ return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
|
||||
+ start, start + size - 1, write);
|
||||
}
|
||||
|
||||
uint32_t guest_io_read(
|
||||
@@ -1793,7 +1796,7 @@ uint32_t guest_io_read(
|
||||
size = min(bytes, 4 - (port & 3));
|
||||
if ( size == 3 )
|
||||
size = 2;
|
||||
- if ( pci_cfg_ok(v->domain, 0, size) )
|
||||
+ if ( pci_cfg_ok(v->domain, 0, port & 3, size) )
|
||||
sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
|
||||
}
|
||||
|
||||
@@ -1866,7 +1869,7 @@ void guest_io_write(
|
||||
size = min(bytes, 4 - (port & 3));
|
||||
if ( size == 3 )
|
||||
size = 2;
|
||||
- if ( pci_cfg_ok(v->domain, 1, size) )
|
||||
+ if ( pci_cfg_ok(v->domain, 1, port & 3, size) )
|
||||
pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
|
||||
}
|
||||
|
||||
--- a/xen/arch/x86/hvm/hvm.c
|
||||
+++ b/xen/arch/x86/hvm/hvm.c
|
||||
@@ -2357,11 +2357,6 @@ void hvm_vcpu_down(struct vcpu *v)
|
||||
static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
|
||||
ioreq_t *p)
|
||||
{
|
||||
-#define CF8_BDF(cf8) (((cf8) & 0x00ffff00) >> 8)
|
||||
-#define CF8_ADDR_LO(cf8) ((cf8) & 0x000000fc)
|
||||
-#define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16)
|
||||
-#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
|
||||
-
|
||||
struct hvm_ioreq_server *s;
|
||||
uint32_t cf8;
|
||||
uint8_t type;
|
||||
@@ -2446,11 +2441,6 @@ static struct hvm_ioreq_server *hvm_sele
|
||||
}
|
||||
|
||||
return d->arch.hvm_domain.default_ioreq_server;
|
||||
-
|
||||
-#undef CF8_ADDR_ENABLED
|
||||
-#undef CF8_ADDR_HI
|
||||
-#undef CF8_ADDR_LO
|
||||
-#undef CF8_BDF
|
||||
}
|
||||
|
||||
int hvm_buffered_io_send(ioreq_t *p)
|
||||
--- a/xen/include/asm-x86/pci.h
|
||||
+++ b/xen/include/asm-x86/pci.h
|
||||
@@ -1,6 +1,11 @@
|
||||
#ifndef __X86_PCI_H__
|
||||
#define __X86_PCI_H__
|
||||
|
||||
+#define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8)
|
||||
+#define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc)
|
||||
+#define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16)
|
||||
+#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
|
||||
+
|
||||
#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \
|
||||
|| id == 0x01268086 || id == 0x01028086 \
|
||||
|| id == 0x01128086 || id == 0x01228086 \
|
@ -1,62 +0,0 @@
References: bsc#925466

# Commit 5cb57f4bddee1f11079e69bf43c193a8b104c476
# Date 2015-06-09 16:00:24 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
kexec: add more pages to v1 environment

Destination pages need mappings to be added to the page tables in the
v1 case (where nothing else calls machine_kexec_add_page() for them).

Further, without the tools mapping the low 1Mb (expected by at least
some Linux version), we need to do so in the hypervisor in the v1 case.

Suggested-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Alan Robinson <alan.robinson@ts.fujitsu.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


--- a/xen/common/kexec.c
+++ b/xen/common/kexec.c
@@ -1003,6 +1003,24 @@ static int kexec_do_load_v1(xen_kexec_lo
     if ( ret < 0 )
         goto error;
 
+    if ( arch == EM_386 || arch == EM_X86_64 )
+    {
+        /*
+         * Ensure 0 - 1 MiB is mapped and accessible by the image.
+         *
+         * This allows access to VGA memory and the region purgatory copies
+         * in the crash case.
+         */
+        unsigned long addr;
+
+        for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
+        {
+            ret = machine_kexec_add_page(kimage, addr, addr);
+            if ( ret < 0 )
+                goto error;
+        }
+    }
+
     ret = kexec_load_slot(kimage);
     if ( ret < 0 )
         goto error;
--- a/xen/common/kimage.c
+++ b/xen/common/kimage.c
@@ -923,6 +923,11 @@ int kimage_build_ind(struct kexec_image
             ret = kimage_add_page(image, page_to_maddr(xen_page));
             if ( ret < 0 )
                 goto done;
+
+            ret = machine_kexec_add_page(image, dest, dest);
+            if ( ret < 0 )
+                goto done;
+
             dest += PAGE_SIZE;
             break;
         }
@ -1,86 +0,0 @@
# Commit 860313f0411d2dcc6b2fd78bfb834b39d05373a6
# Date 2015-06-10 12:05:21 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: adjust EFI_MEMORY_WP handling for spec version 2.5

That flag now means cachability rather than protection, and a new flag
EFI_MEMORY_RO got added in its place.

Along with EFI_MEMORY_RO also add the two other new EFI_MEMORY_*
definitions, even if we don't need them right away.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

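A small standalone illustration (not part of the patch; the constants are
the ones added in the hunks below, the helper name is made up): since UEFI
2.5 repurposed EFI_MEMORY_WP as a cachability attribute and introduced
EFI_MEMORY_RO, the bit meaning "read-only" has to be chosen based on the
firmware's declared revision.

#include <stdint.h>
#include <stdio.h>

#define EFI_REVISION(major, minor) (((major) << 16) | (minor))
#define EFI_MEMORY_WP 0x0000000000001000ULL
#define EFI_MEMORY_RO 0x0000000000020000ULL

/* Return the attribute bit that means "read-only" for a given revision. */
static uint64_t ro_attr(uint32_t revision)
{
    return revision < EFI_REVISION(2, 5) ? EFI_MEMORY_WP : EFI_MEMORY_RO;
}

int main(void)
{
    printf("2.4 firmware: %#llx\n", (unsigned long long)ro_attr(EFI_REVISION(2, 4)));
    printf("2.5 firmware: %#llx\n", (unsigned long long)ro_attr(EFI_REVISION(2, 5)));
    return 0;
}
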
Index: xen-4.5.1-testing/xen/common/efi/boot.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/xen/common/efi/boot.c
|
||||
+++ xen-4.5.1-testing/xen/common/efi/boot.c
|
||||
@@ -32,6 +32,8 @@
|
||||
/* Using SetVirtualAddressMap() is incompatible with kexec: */
|
||||
#undef USE_SET_VIRTUAL_ADDRESS_MAP
|
||||
|
||||
+#define EFI_REVISION(major, minor) (((major) << 16) | (minor))
|
||||
+
|
||||
#define SHIM_LOCK_PROTOCOL_GUID \
|
||||
{ 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} }
|
||||
|
||||
@@ -76,6 +78,7 @@ static int set_color(u32 mask, int bpp,
|
||||
static bool_t match_guid(const EFI_GUID *guid1, const EFI_GUID *guid2);
|
||||
|
||||
static const EFI_BOOT_SERVICES *__initdata efi_bs;
|
||||
+static UINT32 __initdata efi_bs_revision;
|
||||
static EFI_HANDLE __initdata efi_ih;
|
||||
|
||||
static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdOut;
|
||||
@@ -714,6 +717,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
|
||||
|
||||
efi_ih = ImageHandle;
|
||||
efi_bs = SystemTable->BootServices;
|
||||
+ efi_bs_revision = efi_bs->Hdr.Revision;
|
||||
efi_rs = SystemTable->RuntimeServices;
|
||||
efi_ct = SystemTable->ConfigurationTable;
|
||||
efi_num_ct = SystemTable->NumberOfTableEntries;
|
||||
@@ -1221,6 +1225,9 @@ void __init efi_init_memory(void)
|
||||
prot |= _PAGE_PAT | MAP_SMALL_PAGES;
|
||||
else if ( desc->Attribute & (EFI_MEMORY_UC | EFI_MEMORY_UCE) )
|
||||
prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
|
||||
+ else if ( efi_bs_revision >= EFI_REVISION(2, 5) &&
|
||||
+ (desc->Attribute & EFI_MEMORY_WP) )
|
||||
+ prot |= _PAGE_PAT | _PAGE_PWT | MAP_SMALL_PAGES;
|
||||
else
|
||||
{
|
||||
printk(XENLOG_ERR "Unknown cachability for MFNs %#lx-%#lx%s\n",
|
||||
@@ -1230,7 +1237,8 @@ void __init efi_init_memory(void)
|
||||
prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
|
||||
}
|
||||
|
||||
- if ( desc->Attribute & EFI_MEMORY_WP )
|
||||
+ if ( desc->Attribute & (efi_bs_revision < EFI_REVISION(2, 5)
|
||||
+ ? EFI_MEMORY_WP : EFI_MEMORY_RO) )
|
||||
prot &= ~_PAGE_RW;
|
||||
if ( (desc->Attribute & EFI_MEMORY_XP) && cpu_has_nx )
|
||||
prot |= _PAGE_NX_BIT;
|
||||
Index: xen-4.5.1-testing/xen/include/efi/efidef.h
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/xen/include/efi/efidef.h
|
||||
+++ xen-4.5.1-testing/xen/include/efi/efidef.h
|
||||
@@ -156,11 +156,15 @@ typedef enum {
|
||||
#define EFI_MEMORY_WT 0x0000000000000004
|
||||
#define EFI_MEMORY_WB 0x0000000000000008
|
||||
#define EFI_MEMORY_UCE 0x0000000000000010
|
||||
+#define EFI_MEMORY_WP 0x0000000000001000
|
||||
|
||||
// physical memory protection on range
|
||||
-#define EFI_MEMORY_WP 0x0000000000001000
|
||||
#define EFI_MEMORY_RP 0x0000000000002000
|
||||
#define EFI_MEMORY_XP 0x0000000000004000
|
||||
+#define EFI_MEMORY_RO 0x0000000000020000
|
||||
+
|
||||
+#define EFI_MEMORY_NV 0x0000000000008000
|
||||
+#define EFI_MEMORY_MORE_RELIABLE 0x0000000000010000
|
||||
|
||||
// range requires a runtime mapping
|
||||
#define EFI_MEMORY_RUNTIME 0x8000000000000000
|
@ -1,99 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967

# Commit 284ffb4f9b0d5c3a33c4c5bd87645d0cc342ca96
# Date 2015-06-11 11:52:18 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: support qword MMIO access

The specification explicitly provides for this, so we should have
supported this from the beginning.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/hvm/vmsi.c
|
||||
+++ b/xen/arch/x86/hvm/vmsi.c
|
||||
@@ -223,7 +223,7 @@ static int msixtbl_read(
|
||||
unsigned int nr_entry, index;
|
||||
int r = X86EMUL_UNHANDLEABLE;
|
||||
|
||||
- if ( len != 4 || (address & 3) )
|
||||
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
|
||||
return r;
|
||||
|
||||
rcu_read_lock(&msixtbl_rcu_lock);
|
||||
@@ -241,13 +241,25 @@ static int msixtbl_read(
|
||||
!acc_bit(test, entry, nr_entry, index) )
|
||||
goto out;
|
||||
*pval = entry->gentries[nr_entry].msi_ad[index];
|
||||
+ if ( len == 8 )
|
||||
+ {
|
||||
+ if ( index )
|
||||
+ offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
|
||||
+ else if ( acc_bit(test, entry, nr_entry, 1) )
|
||||
+ *pval |= (u64)entry->gentries[nr_entry].msi_ad[1] << 32;
|
||||
+ else
|
||||
+ goto out;
|
||||
+ }
|
||||
}
|
||||
- else
|
||||
+ if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
|
||||
{
|
||||
virt = msixtbl_addr_to_virt(entry, address);
|
||||
if ( !virt )
|
||||
goto out;
|
||||
- *pval = readl(virt);
|
||||
+ if ( len == 4 )
|
||||
+ *pval = readl(virt);
|
||||
+ else
|
||||
+ *pval |= (u64)readl(virt) << 32;
|
||||
}
|
||||
|
||||
r = X86EMUL_OKAY;
|
||||
@@ -268,7 +280,7 @@ static int msixtbl_write(struct vcpu *v,
|
||||
unsigned long flags, orig;
|
||||
struct irq_desc *desc;
|
||||
|
||||
- if ( len != 4 || (address & 3) )
|
||||
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
|
||||
return r;
|
||||
|
||||
rcu_read_lock(&msixtbl_rcu_lock);
|
||||
@@ -279,16 +291,23 @@ static int msixtbl_write(struct vcpu *v,
|
||||
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
|
||||
|
||||
offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
|
||||
- if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
|
||||
+ if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
|
||||
{
|
||||
+ index = offset / sizeof(uint32_t);
|
||||
if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
|
||||
{
|
||||
- index = offset / sizeof(uint32_t);
|
||||
entry->gentries[nr_entry].msi_ad[index] = val;
|
||||
acc_bit(set, entry, nr_entry, index);
|
||||
+ if ( len == 8 && !index )
|
||||
+ {
|
||||
+ entry->gentries[nr_entry].msi_ad[1] = val >> 32;
|
||||
+ acc_bit(set, entry, nr_entry, 1);
|
||||
+ }
|
||||
}
|
||||
set_bit(nr_entry, &entry->table_flags);
|
||||
- goto out;
|
||||
+ if ( len != 8 || !index )
|
||||
+ goto out;
|
||||
+ val >>= 32;
|
||||
}
|
||||
|
||||
/* Exit to device model when unmasking and address/data got modified. */
|
||||
@@ -352,7 +371,8 @@ static int msixtbl_write(struct vcpu *v,
|
||||
|
||||
unlock:
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
- r = X86EMUL_OKAY;
|
||||
+ if ( len == 4 )
|
||||
+ r = X86EMUL_OKAY;
|
||||
|
||||
out:
|
||||
rcu_read_unlock(&msixtbl_rcu_lock);
|
@ -1,551 +0,0 @@
# Commit b4650e9a96d78b87ccf7deb4f74733ccfcc64db5
# Date 2015-06-15 13:22:07 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: per-active entry locking

Introduce a per-active entry spin lock to protect active entry state.
The grant table lock must be locked before acquiring (locking) an
active entry.

This is a step in reducing contention on the grant table lock, but
will only do so once the grant table lock is turned into a read-write
lock.

Based on a patch originally by Matt Wilson <msw@amazon.com>.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/grant-tables.txt
|
||||
+++ b/docs/misc/grant-tables.txt
|
||||
@@ -63,6 +63,7 @@ is complete.
|
||||
act->domid : remote domain being granted rights
|
||||
act->frame : machine frame being granted
|
||||
act->pin : used to hold reference counts
|
||||
+ act->lock : spinlock used to serialize access to active entry state
|
||||
|
||||
Map tracking
|
||||
~~~~~~~~~~~~
|
||||
@@ -74,7 +75,46 @@ is complete.
|
||||
matching map track entry is then removed, as if unmap had been invoked.
|
||||
These are not used by the transfer mechanism.
|
||||
map->domid : owner of the mapped frame
|
||||
- map->ref_and_flags : grant reference, ro/rw, mapped for host or device access
|
||||
+ map->ref : grant reference
|
||||
+ map->flags : ro/rw, mapped for host or device access
|
||||
+
|
||||
+********************************************************************************
|
||||
+ Locking
|
||||
+ ~~~~~~~
|
||||
+ Xen uses several locks to serialize access to the internal grant table state.
|
||||
+
|
||||
+ grant_table->lock : lock used to prevent readers from accessing
|
||||
+ inconsistent grant table state such as current
|
||||
+ version, partially initialized active table pages,
|
||||
+ etc.
|
||||
+ active_grant_entry->lock : spinlock used to serialize modifications to
|
||||
+ active entries
|
||||
+
|
||||
+ The primary lock for the grant table is a spinlock. All functions
|
||||
+ that access members of struct grant_table must acquire the lock
|
||||
+ around critical sections.
|
||||
+
|
||||
+ Active entries are obtained by calling active_entry_acquire(gt, ref).
|
||||
+ This function returns a pointer to the active entry after locking its
|
||||
+ spinlock. The caller must hold the grant table lock for the gt in
|
||||
+ question before calling active_entry_acquire(). This is because the
|
||||
+ grant table can be dynamically extended via gnttab_grow_table() while
|
||||
+ a domain is running and must be fully initialized. Once all access to
|
||||
+ the active entry is complete, release the lock by calling
|
||||
+ active_entry_release(act).
|
||||
+
|
||||
+ Summary of rules for locking:
|
||||
+ active_entry_acquire() and active_entry_release() can only be
|
||||
+ called when holding the relevant grant table's lock. I.e.:
|
||||
+ spin_lock(>->lock);
|
||||
+ act = active_entry_acquire(gt, ref);
|
||||
+ ...
|
||||
+ active_entry_release(act);
|
||||
+ spin_unlock(>->lock);
|
||||
+
|
||||
+ Active entries cannot be acquired while holding the maptrack lock.
|
||||
+ Multiple active entries can be acquired while holding the grant table
|
||||
+ lock.
|
||||
|
||||
********************************************************************************
|
||||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -157,10 +157,13 @@ struct active_grant_entry {
|
||||
in the page. */
|
||||
unsigned length:16; /* For sub-page grants, the length of the
|
||||
grant. */
|
||||
+ spinlock_t lock; /* lock to protect access of this entry.
|
||||
+ see docs/misc/grant-tables.txt for
|
||||
+ locking protocol */
|
||||
};
|
||||
|
||||
#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
|
||||
-#define active_entry(t, e) \
|
||||
+#define _active_entry(t, e) \
|
||||
((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
|
||||
|
||||
static inline void gnttab_flush_tlb(const struct domain *d)
|
||||
@@ -188,6 +191,24 @@ nr_active_grant_frames(struct grant_tabl
|
||||
return num_act_frames_from_sha_frames(nr_grant_frames(gt));
|
||||
}
|
||||
|
||||
+static inline struct active_grant_entry *
|
||||
+active_entry_acquire(struct grant_table *t, grant_ref_t e)
|
||||
+{
|
||||
+ struct active_grant_entry *act;
|
||||
+
|
||||
+ ASSERT(spin_is_locked(&t->lock));
|
||||
+
|
||||
+ act = &_active_entry(t, e);
|
||||
+ spin_lock(&act->lock);
|
||||
+
|
||||
+ return act;
|
||||
+}
|
||||
+
|
||||
+static inline void active_entry_release(struct active_grant_entry *act)
|
||||
+{
|
||||
+ spin_unlock(&act->lock);
|
||||
+}
|
||||
+
|
||||
/* Check if the page has been paged out, or needs unsharing.
|
||||
If rc == GNTST_okay, *page contains the page struct with a ref taken.
|
||||
Caller must do put_page(*page).
|
||||
@@ -505,7 +526,6 @@ static int grant_map_exists(const struct
|
||||
unsigned long mfn,
|
||||
unsigned int *ref_count)
|
||||
{
|
||||
- const struct active_grant_entry *act;
|
||||
unsigned int ref, max_iter;
|
||||
|
||||
ASSERT(spin_is_locked(&rgt->lock));
|
||||
@@ -514,18 +534,19 @@ static int grant_map_exists(const struct
|
||||
nr_grant_entries(rgt));
|
||||
for ( ref = *ref_count; ref < max_iter; ref++ )
|
||||
{
|
||||
- act = &active_entry(rgt, ref);
|
||||
+ struct active_grant_entry *act;
|
||||
+ bool_t exists;
|
||||
|
||||
- if ( !act->pin )
|
||||
- continue;
|
||||
+ act = active_entry_acquire(rgt, ref);
|
||||
|
||||
- if ( act->domid != ld->domain_id )
|
||||
- continue;
|
||||
+ exists = act->pin
|
||||
+ && act->domid == ld->domain_id
|
||||
+ && act->frame == mfn;
|
||||
|
||||
- if ( act->frame != mfn )
|
||||
- continue;
|
||||
+ active_entry_release(act);
|
||||
|
||||
- return 0;
|
||||
+ if ( exists )
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
if ( ref < nr_grant_entries(rgt) )
|
||||
@@ -546,13 +567,24 @@ static void mapcount(
|
||||
|
||||
*wrc = *rdc = 0;
|
||||
|
||||
+ /*
|
||||
+ * Must have the local domain's grant table lock when iterating
|
||||
+ * over its maptrack entries.
|
||||
+ */
|
||||
+ ASSERT(spin_is_locked(&lgt->lock));
|
||||
+ /*
|
||||
+ * Must have the remote domain's grant table lock while counting
|
||||
+ * its active entries.
|
||||
+ */
|
||||
+ ASSERT(spin_is_locked(&rd->grant_table->lock));
|
||||
+
|
||||
for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
|
||||
{
|
||||
map = &maptrack_entry(lgt, handle);
|
||||
if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
|
||||
map->domid != rd->domain_id )
|
||||
continue;
|
||||
- if ( active_entry(rd->grant_table, map->ref).frame == mfn )
|
||||
+ if ( _active_entry(rd->grant_table, map->ref).frame == mfn )
|
||||
(map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++;
|
||||
}
|
||||
}
|
||||
@@ -639,7 +671,7 @@ __gnttab_map_grant_ref(
|
||||
if ( unlikely(op->ref >= nr_grant_entries(rgt)))
|
||||
PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref);
|
||||
|
||||
- act = &active_entry(rgt, op->ref);
|
||||
+ act = active_entry_acquire(rgt, op->ref);
|
||||
shah = shared_entry_header(rgt, op->ref);
|
||||
if (rgt->gt_version == 1) {
|
||||
sha1 = &shared_entry_v1(rgt, op->ref);
|
||||
@@ -656,7 +688,7 @@ __gnttab_map_grant_ref(
|
||||
((act->domid != ld->domain_id) ||
|
||||
(act->pin & 0x80808080U) != 0 ||
|
||||
(act->is_sub_page)) )
|
||||
- PIN_FAIL(unlock_out, GNTST_general_error,
|
||||
+ PIN_FAIL(act_release_out, GNTST_general_error,
|
||||
"Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n",
|
||||
act->domid, ld->domain_id, act->pin, act->is_sub_page);
|
||||
|
||||
@@ -667,7 +699,7 @@ __gnttab_map_grant_ref(
|
||||
if ( (rc = _set_status(rgt->gt_version, ld->domain_id,
|
||||
op->flags & GNTMAP_readonly,
|
||||
1, shah, act, status) ) != GNTST_okay )
|
||||
- goto unlock_out;
|
||||
+ goto act_release_out;
|
||||
|
||||
if ( !act->pin )
|
||||
{
|
||||
@@ -702,6 +734,7 @@ __gnttab_map_grant_ref(
|
||||
|
||||
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
|
||||
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
|
||||
/* pg may be set, with a refcount included, from __get_paged_frame */
|
||||
@@ -839,7 +872,7 @@ __gnttab_map_grant_ref(
|
||||
|
||||
spin_lock(&rgt->lock);
|
||||
|
||||
- act = &active_entry(rgt, op->ref);
|
||||
+ act = active_entry_acquire(rgt, op->ref);
|
||||
|
||||
if ( op->flags & GNTMAP_device_map )
|
||||
act->pin -= (op->flags & GNTMAP_readonly) ?
|
||||
@@ -856,6 +889,9 @@ __gnttab_map_grant_ref(
|
||||
if ( !act->pin )
|
||||
gnttab_clear_flag(_GTF_reading, status);
|
||||
|
||||
+ act_release_out:
|
||||
+ active_entry_release(act);
|
||||
+
|
||||
unlock_out:
|
||||
spin_unlock(&rgt->lock);
|
||||
op->status = rc;
|
||||
@@ -950,7 +986,7 @@ __gnttab_unmap_common(
|
||||
}
|
||||
|
||||
op->rd = rd;
|
||||
- act = &active_entry(rgt, op->map->ref);
|
||||
+ act = active_entry_acquire(rgt, op->map->ref);
|
||||
|
||||
if ( op->frame == 0 )
|
||||
{
|
||||
@@ -959,7 +995,7 @@ __gnttab_unmap_common(
|
||||
else
|
||||
{
|
||||
if ( unlikely(op->frame != act->frame) )
|
||||
- PIN_FAIL(unmap_out, GNTST_general_error,
|
||||
+ PIN_FAIL(act_release_out, GNTST_general_error,
|
||||
"Bad frame number doesn't match gntref. (%lx != %lx)\n",
|
||||
op->frame, act->frame);
|
||||
if ( op->flags & GNTMAP_device_map )
|
||||
@@ -978,7 +1014,7 @@ __gnttab_unmap_common(
|
||||
if ( (rc = replace_grant_host_mapping(op->host_addr,
|
||||
op->frame, op->new_addr,
|
||||
op->flags)) < 0 )
|
||||
- goto unmap_out;
|
||||
+ goto act_release_out;
|
||||
|
||||
ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask));
|
||||
op->map->flags &= ~GNTMAP_host_map;
|
||||
@@ -1000,7 +1036,7 @@ __gnttab_unmap_common(
|
||||
if ( err )
|
||||
{
|
||||
rc = GNTST_general_error;
|
||||
- goto unmap_out;
|
||||
+ goto act_release_out;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1008,8 +1044,11 @@ __gnttab_unmap_common(
|
||||
if ( !(op->flags & GNTMAP_readonly) )
|
||||
gnttab_mark_dirty(rd, op->frame);
|
||||
|
||||
+ act_release_out:
|
||||
+ active_entry_release(act);
|
||||
unmap_out:
|
||||
double_gt_unlock(lgt, rgt);
|
||||
+
|
||||
op->status = rc;
|
||||
rcu_unlock_domain(rd);
|
||||
}
|
||||
@@ -1042,9 +1081,9 @@ __gnttab_unmap_common_complete(struct gn
|
||||
spin_lock(&rgt->lock);
|
||||
|
||||
if ( rgt->gt_version == 0 )
|
||||
- goto unmap_out;
|
||||
+ goto unlock_out;
|
||||
|
||||
- act = &active_entry(rgt, op->map->ref);
|
||||
+ act = active_entry_acquire(rgt, op->map->ref);
|
||||
sha = shared_entry_header(rgt, op->map->ref);
|
||||
|
||||
if ( rgt->gt_version == 1 )
|
||||
@@ -1058,7 +1097,7 @@ __gnttab_unmap_common_complete(struct gn
|
||||
* Suggests that __gntab_unmap_common failed early and so
|
||||
* nothing further to do
|
||||
*/
|
||||
- goto unmap_out;
|
||||
+ goto act_release_out;
|
||||
}
|
||||
|
||||
pg = mfn_to_page(op->frame);
|
||||
@@ -1082,7 +1121,7 @@ __gnttab_unmap_common_complete(struct gn
|
||||
* Suggests that __gntab_unmap_common failed in
|
||||
* replace_grant_host_mapping() so nothing further to do
|
||||
*/
|
||||
- goto unmap_out;
|
||||
+ goto act_release_out;
|
||||
}
|
||||
|
||||
if ( !is_iomem_page(op->frame) )
|
||||
@@ -1103,8 +1142,11 @@ __gnttab_unmap_common_complete(struct gn
|
||||
if ( act->pin == 0 )
|
||||
gnttab_clear_flag(_GTF_reading, status);
|
||||
|
||||
- unmap_out:
|
||||
+ act_release_out:
|
||||
+ active_entry_release(act);
|
||||
+ unlock_out:
|
||||
spin_unlock(&rgt->lock);
|
||||
+
|
||||
if ( put_handle )
|
||||
{
|
||||
op->map->flags = 0;
|
||||
@@ -1296,7 +1338,7 @@ gnttab_grow_table(struct domain *d, unsi
|
||||
/* d's grant table lock must be held by the caller */
|
||||
|
||||
struct grant_table *gt = d->grant_table;
|
||||
- unsigned int i;
|
||||
+ unsigned int i, j;
|
||||
|
||||
ASSERT(req_nr_frames <= max_grant_frames);
|
||||
|
||||
@@ -1311,6 +1353,8 @@ gnttab_grow_table(struct domain *d, unsi
|
||||
if ( (gt->active[i] = alloc_xenheap_page()) == NULL )
|
||||
goto active_alloc_failed;
|
||||
clear_page(gt->active[i]);
|
||||
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
|
||||
+ spin_lock_init(>->active[i][j].lock);
|
||||
}
|
||||
|
||||
/* Shared */
|
||||
@@ -1805,7 +1849,7 @@ __release_grant_for_copy(
|
||||
|
||||
spin_lock(&rgt->lock);
|
||||
|
||||
- act = &active_entry(rgt, gref);
|
||||
+ act = active_entry_acquire(rgt, gref);
|
||||
sha = shared_entry_header(rgt, gref);
|
||||
r_frame = act->frame;
|
||||
|
||||
@@ -1844,6 +1888,7 @@ __release_grant_for_copy(
|
||||
released_read = 1;
|
||||
}
|
||||
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
|
||||
if ( td != rd )
|
||||
@@ -1905,14 +1950,14 @@ __acquire_grant_for_copy(
|
||||
spin_lock(&rgt->lock);
|
||||
|
||||
if ( rgt->gt_version == 0 )
|
||||
- PIN_FAIL(unlock_out, GNTST_general_error,
|
||||
+ PIN_FAIL(gt_unlock_out, GNTST_general_error,
|
||||
"remote grant table not ready\n");
|
||||
|
||||
if ( unlikely(gref >= nr_grant_entries(rgt)) )
|
||||
- PIN_FAIL(unlock_out, GNTST_bad_gntref,
|
||||
+ PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
|
||||
"Bad grant reference %ld\n", gref);
|
||||
|
||||
- act = &active_entry(rgt, gref);
|
||||
+ act = active_entry_acquire(rgt, gref);
|
||||
shah = shared_entry_header(rgt, gref);
|
||||
if ( rgt->gt_version == 1 )
|
||||
{
|
||||
@@ -1971,6 +2016,13 @@ __acquire_grant_for_copy(
|
||||
PIN_FAIL(unlock_out_clear, GNTST_general_error,
|
||||
"transitive grant referenced bad domain %d\n",
|
||||
trans_domid);
|
||||
+
|
||||
+ /*
|
||||
+ * __acquire_grant_for_copy() could take the lock on the
|
||||
+ * remote table (if rd == td), so we have to drop the lock
|
||||
+ * here and reacquire
|
||||
+ */
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
|
||||
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
|
||||
@@ -1978,9 +2030,12 @@ __acquire_grant_for_copy(
|
||||
&trans_page_off, &trans_length, 0);
|
||||
|
||||
spin_lock(&rgt->lock);
|
||||
+ act = active_entry_acquire(rgt, gref);
|
||||
+
|
||||
if ( rc != GNTST_okay ) {
|
||||
__fixup_status_for_copy_pin(act, status);
|
||||
rcu_unlock_domain(td);
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
return rc;
|
||||
}
|
||||
@@ -1993,6 +2048,7 @@ __acquire_grant_for_copy(
|
||||
{
|
||||
__fixup_status_for_copy_pin(act, status);
|
||||
rcu_unlock_domain(td);
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
put_page(*page);
|
||||
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
|
||||
@@ -2061,6 +2117,7 @@ __acquire_grant_for_copy(
|
||||
*length = act->length;
|
||||
*frame = act->frame;
|
||||
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
return rc;
|
||||
|
||||
@@ -2073,7 +2130,11 @@ __acquire_grant_for_copy(
|
||||
gnttab_clear_flag(_GTF_reading, status);
|
||||
|
||||
unlock_out:
|
||||
+ active_entry_release(act);
|
||||
+
|
||||
+ gt_unlock_out:
|
||||
spin_unlock(&rgt->lock);
|
||||
+
|
||||
return rc;
|
||||
}
|
||||
|
||||
@@ -2373,7 +2434,6 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
|
||||
gnttab_set_version_t op;
|
||||
struct domain *d = current->domain;
|
||||
struct grant_table *gt = d->grant_table;
|
||||
- struct active_grant_entry *act;
|
||||
grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES];
|
||||
long res;
|
||||
int i;
|
||||
@@ -2398,8 +2458,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
|
||||
{
|
||||
for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ )
|
||||
{
|
||||
- act = &active_entry(gt, i);
|
||||
- if ( act->pin != 0 )
|
||||
+ if ( read_atomic(&_active_entry(gt, i).pin) != 0 )
|
||||
{
|
||||
gdprintk(XENLOG_WARNING,
|
||||
"tried to change grant table version from %d to %d, but some grant entries still in use\n",
|
||||
@@ -2586,7 +2645,8 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
|
||||
{
|
||||
struct domain *d = rcu_lock_current_domain();
|
||||
struct grant_table *gt = d->grant_table;
|
||||
- struct active_grant_entry *act;
|
||||
+ struct active_grant_entry *act_a = NULL;
|
||||
+ struct active_grant_entry *act_b = NULL;
|
||||
s16 rc = GNTST_okay;
|
||||
|
||||
spin_lock(>->lock);
|
||||
@@ -2600,12 +2660,16 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
|
||||
if ( unlikely(ref_b >= nr_grant_entries(d->grant_table)))
|
||||
PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b);
|
||||
|
||||
- act = &active_entry(gt, ref_a);
|
||||
- if ( act->pin )
|
||||
+ /* Swapping the same ref is a no-op. */
|
||||
+ if ( ref_a == ref_b )
|
||||
+ goto out;
|
||||
+
|
||||
+ act_a = active_entry_acquire(gt, ref_a);
|
||||
+ if ( act_a->pin )
|
||||
PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a);
|
||||
|
||||
- act = &active_entry(gt, ref_b);
|
||||
- if ( act->pin )
|
||||
+ act_b = active_entry_acquire(gt, ref_b);
|
||||
+ if ( act_b->pin )
|
||||
PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b);
|
||||
|
||||
if ( gt->gt_version == 1 )
|
||||
@@ -2632,6 +2696,10 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
|
||||
}
|
||||
|
||||
out:
|
||||
+ if ( act_b != NULL )
|
||||
+ active_entry_release(act_b);
|
||||
+ if ( act_a != NULL )
|
||||
+ active_entry_release(act_a);
|
||||
spin_unlock(>->lock);
|
||||
|
||||
rcu_unlock_domain(d);
|
||||
@@ -2941,7 +3009,7 @@ grant_table_create(
|
||||
struct domain *d)
|
||||
{
|
||||
struct grant_table *t;
|
||||
- int i;
|
||||
+ unsigned int i, j;
|
||||
|
||||
if ( (t = xzalloc(struct grant_table)) == NULL )
|
||||
goto no_mem_0;
|
||||
@@ -2960,6 +3028,8 @@ grant_table_create(
|
||||
if ( (t->active[i] = alloc_xenheap_page()) == NULL )
|
||||
goto no_mem_2;
|
||||
clear_page(t->active[i]);
|
||||
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
|
||||
+ spin_lock_init(&t->active[i][j].lock);
|
||||
}
|
||||
|
||||
/* Tracking of mapped foreign frames table */
|
||||
@@ -3056,7 +3126,7 @@ gnttab_release_mappings(
|
||||
rgt = rd->grant_table;
|
||||
spin_lock(&rgt->lock);
|
||||
|
||||
- act = &active_entry(rgt, ref);
|
||||
+ act = active_entry_acquire(rgt, ref);
|
||||
sha = shared_entry_header(rgt, ref);
|
||||
if (rgt->gt_version == 1)
|
||||
status = &sha->flags;
|
||||
@@ -3114,6 +3184,7 @@ gnttab_release_mappings(
|
||||
if ( act->pin == 0 )
|
||||
gnttab_clear_flag(_GTF_reading, status);
|
||||
|
||||
+ active_entry_release(act);
|
||||
spin_unlock(&rgt->lock);
|
||||
|
||||
rcu_unlock_domain(rd);
|
||||
@@ -3176,9 +3247,12 @@ static void gnttab_usage_print(struct do
|
||||
uint16_t status;
|
||||
uint64_t frame;
|
||||
|
||||
- act = &active_entry(gt, ref);
|
||||
+ act = active_entry_acquire(gt, ref);
|
||||
if ( !act->pin )
|
||||
+ {
|
||||
+ active_entry_release(act);
|
||||
continue;
|
||||
+ }
|
||||
|
||||
sha = shared_entry_header(gt, ref);
|
||||
|
||||
@@ -3208,6 +3282,7 @@ static void gnttab_usage_print(struct do
|
||||
printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n",
|
||||
ref, act->domid, act->frame, act->pin,
|
||||
sha->domid, frame, status);
|
||||
+ active_entry_release(act);
|
||||
}
|
||||
|
||||
out:
|
@ -1,86 +0,0 @@
# Commit 5a9899ddc42040e139233a6b1f0f65f3b65eda6d
# Date 2015-06-15 13:23:34 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: introduce maptrack lock

Split grant table lock into two separate locks. One to protect
maptrack free list (maptrack_lock) and one for everything else (lock).

Based on a patch originally by Matt Wilson <msw@amazon.com>.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/grant-tables.txt
|
||||
+++ b/docs/misc/grant-tables.txt
|
||||
@@ -87,6 +87,7 @@ is complete.
|
||||
inconsistent grant table state such as current
|
||||
version, partially initialized active table pages,
|
||||
etc.
|
||||
+ grant_table->maptrack_lock : spinlock used to protect the maptrack free list
|
||||
active_grant_entry->lock : spinlock used to serialize modifications to
|
||||
active entries
|
||||
|
||||
@@ -94,6 +95,9 @@ is complete.
|
||||
that access members of struct grant_table must acquire the lock
|
||||
around critical sections.
|
||||
|
||||
+ The maptrack free list is protected by its own spinlock. The maptrack
|
||||
+ lock may be locked while holding the grant table lock.
|
||||
+
|
||||
Active entries are obtained by calling active_entry_acquire(gt, ref).
|
||||
This function returns a pointer to the active entry after locking its
|
||||
spinlock. The caller must hold the grant table lock for the gt in
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -288,10 +288,10 @@ static inline void
|
||||
put_maptrack_handle(
|
||||
struct grant_table *t, int handle)
|
||||
{
|
||||
- spin_lock(&t->lock);
|
||||
+ spin_lock(&t->maptrack_lock);
|
||||
maptrack_entry(t, handle).ref = t->maptrack_head;
|
||||
t->maptrack_head = handle;
|
||||
- spin_unlock(&t->lock);
|
||||
+ spin_unlock(&t->maptrack_lock);
|
||||
}
|
||||
|
||||
static inline int
|
||||
@@ -303,7 +303,7 @@ get_maptrack_handle(
|
||||
struct grant_mapping *new_mt;
|
||||
unsigned int new_mt_limit, nr_frames;
|
||||
|
||||
- spin_lock(&lgt->lock);
|
||||
+ spin_lock(&lgt->maptrack_lock);
|
||||
|
||||
while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
|
||||
{
|
||||
@@ -332,7 +332,7 @@ get_maptrack_handle(
|
||||
nr_frames + 1);
|
||||
}
|
||||
|
||||
- spin_unlock(&lgt->lock);
|
||||
+ spin_unlock(&lgt->maptrack_lock);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@@ -3016,6 +3016,7 @@ grant_table_create(
|
||||
|
||||
/* Simple stuff. */
|
||||
spin_lock_init(&t->lock);
|
||||
+ spin_lock_init(&t->maptrack_lock);
|
||||
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
|
||||
|
||||
/* Active grant table. */
|
||||
--- a/xen/include/xen/grant_table.h
|
||||
+++ b/xen/include/xen/grant_table.h
|
||||
@@ -82,6 +82,8 @@ struct grant_table {
|
||||
struct grant_mapping **maptrack;
|
||||
unsigned int maptrack_head;
|
||||
unsigned int maptrack_limit;
|
||||
+ /* Lock protecting the maptrack page list, head, and limit */
|
||||
+ spinlock_t maptrack_lock;
|
||||
/* Lock protecting updates to active and shared grant tables. */
|
||||
spinlock_t lock;
|
||||
/* The defined versions are 1 and 2. Set to 0 if we don't know
|
@ -1,733 +0,0 @@
# Commit 40de9fffb4cc0b0485aa3391d72e2220b8e1ce12
# Date 2015-06-15 13:25:20 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: make the grant table lock a read-write lock

In combination with the per-active entry locks, the grant table lock
can be made a read-write lock since in the majority of cases only the
read lock is required. The grant table read lock protects against
changes to the table version or size (which are done with the write
lock held).

The write lock is also required when two active entries must be
acquired.

The double lock is still required when updating IOMMU page tables.

With the lock contention being only on the maptrack lock (unless IOMMU
updates are required), performance and scalability are improved.

Based on a patch originally by Matt Wilson <msw@amazon.com>.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/grant-tables.txt
|
||||
+++ b/docs/misc/grant-tables.txt
|
||||
@@ -83,7 +83,7 @@ is complete.
|
||||
~~~~~~~
|
||||
Xen uses several locks to serialize access to the internal grant table state.
|
||||
|
||||
- grant_table->lock : lock used to prevent readers from accessing
|
||||
+ grant_table->lock : rwlock used to prevent readers from accessing
|
||||
inconsistent grant table state such as current
|
||||
version, partially initialized active table pages,
|
||||
etc.
|
||||
@@ -91,34 +91,43 @@ is complete.
|
||||
active_grant_entry->lock : spinlock used to serialize modifications to
|
||||
active entries
|
||||
|
||||
- The primary lock for the grant table is a spinlock. All functions
|
||||
- that access members of struct grant_table must acquire the lock
|
||||
- around critical sections.
|
||||
+ The primary lock for the grant table is a read/write spinlock. All
|
||||
+ functions that access members of struct grant_table must acquire a
|
||||
+ read lock around critical sections. Any modification to the members
|
||||
+ of struct grant_table (e.g., nr_status_frames, nr_grant_frames,
|
||||
+ active frames, etc.) must only be made if the write lock is
|
||||
+ held. These elements are read-mostly, and read critical sections can
|
||||
+ be large, which makes a rwlock a good choice.
|
||||
|
||||
The maptrack free list is protected by its own spinlock. The maptrack
|
||||
lock may be locked while holding the grant table lock.
|
||||
|
||||
Active entries are obtained by calling active_entry_acquire(gt, ref).
|
||||
This function returns a pointer to the active entry after locking its
|
||||
- spinlock. The caller must hold the grant table lock for the gt in
|
||||
- question before calling active_entry_acquire(). This is because the
|
||||
- grant table can be dynamically extended via gnttab_grow_table() while
|
||||
- a domain is running and must be fully initialized. Once all access to
|
||||
- the active entry is complete, release the lock by calling
|
||||
- active_entry_release(act).
|
||||
+ spinlock. The caller must hold the grant table read lock before
|
||||
+ calling active_entry_acquire(). This is because the grant table can
|
||||
+ be dynamically extended via gnttab_grow_table() while a domain is
|
||||
+ running and must be fully initialized. Once all access to the active
|
||||
+ entry is complete, release the lock by calling active_entry_release(act).
|
||||
|
||||
Summary of rules for locking:
|
||||
active_entry_acquire() and active_entry_release() can only be
|
||||
- called when holding the relevant grant table's lock. I.e.:
|
||||
- spin_lock(>->lock);
|
||||
+ called when holding the relevant grant table's read lock. I.e.:
|
||||
+ read_lock(>->lock);
|
||||
act = active_entry_acquire(gt, ref);
|
||||
...
|
||||
active_entry_release(act);
|
||||
- spin_unlock(>->lock);
|
||||
+ read_unlock(>->lock);
|
||||
|
||||
Active entries cannot be acquired while holding the maptrack lock.
|
||||
Multiple active entries can be acquired while holding the grant table
|
||||
- lock.
|
||||
+ _write_ lock.
|
||||
+
|
||||
+ Maptrack entries are protected by the corresponding active entry
|
||||
+ lock. As an exception, new maptrack entries may be populated without
|
||||
+ holding the lock, provided the flags field is written last. This
|
||||
+ requires any maptrack entry user validates the flags field as
|
||||
+ non-zero first.
|
||||
|
||||
********************************************************************************
|
||||
|
||||
--- a/xen/arch/arm/mm.c
|
||||
+++ b/xen/arch/arm/mm.c
|
||||
@@ -1037,7 +1037,7 @@ int xenmem_add_to_physmap_one(
|
||||
switch ( space )
|
||||
{
|
||||
case XENMAPSPACE_grant_table:
|
||||
- spin_lock(&d->grant_table->lock);
|
||||
+ write_lock(&d->grant_table->lock);
|
||||
|
||||
if ( d->grant_table->gt_version == 0 )
|
||||
d->grant_table->gt_version = 1;
|
||||
@@ -1067,7 +1067,7 @@ int xenmem_add_to_physmap_one(
|
||||
|
||||
t = p2m_ram_rw;
|
||||
|
||||
- spin_unlock(&d->grant_table->lock);
|
||||
+ write_unlock(&d->grant_table->lock);
|
||||
break;
|
||||
case XENMAPSPACE_shared_info:
|
||||
if ( idx != 0 )
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -4595,7 +4595,7 @@ int xenmem_add_to_physmap_one(
|
||||
mfn = virt_to_mfn(d->shared_info);
|
||||
break;
|
||||
case XENMAPSPACE_grant_table:
|
||||
- spin_lock(&d->grant_table->lock);
|
||||
+ write_lock(&d->grant_table->lock);
|
||||
|
||||
if ( d->grant_table->gt_version == 0 )
|
||||
d->grant_table->gt_version = 1;
|
||||
@@ -4617,7 +4617,7 @@ int xenmem_add_to_physmap_one(
|
||||
mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
|
||||
}
|
||||
|
||||
- spin_unlock(&d->grant_table->lock);
|
||||
+ write_unlock(&d->grant_table->lock);
|
||||
break;
|
||||
case XENMAPSPACE_gmfn_range:
|
||||
case XENMAPSPACE_gmfn:
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -196,7 +196,7 @@ active_entry_acquire(struct grant_table
|
||||
{
|
||||
struct active_grant_entry *act;
|
||||
|
||||
- ASSERT(spin_is_locked(&t->lock));
|
||||
+ ASSERT(rw_is_locked(&t->lock));
|
||||
|
||||
act = &_active_entry(t, e);
|
||||
spin_lock(&act->lock);
|
||||
@@ -252,25 +252,29 @@ static int __get_paged_frame(unsigned lo
|
||||
static inline void
|
||||
double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
|
||||
{
|
||||
+ /*
|
||||
+ * See mapcount() for why the write lock is also required for the
|
||||
+ * remote domain.
|
||||
+ */
|
||||
if ( lgt < rgt )
|
||||
{
|
||||
- spin_lock(&lgt->lock);
|
||||
- spin_lock(&rgt->lock);
|
||||
+ write_lock(&lgt->lock);
|
||||
+ write_lock(&rgt->lock);
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( lgt != rgt )
|
||||
- spin_lock(&rgt->lock);
|
||||
- spin_lock(&lgt->lock);
|
||||
+ write_lock(&rgt->lock);
|
||||
+ write_lock(&lgt->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
|
||||
{
|
||||
- spin_unlock(&lgt->lock);
|
||||
+ write_unlock(&lgt->lock);
|
||||
if ( lgt != rgt )
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ write_unlock(&rgt->lock);
|
||||
}
|
||||
|
||||
static inline int
|
||||
@@ -528,7 +532,7 @@ static int grant_map_exists(const struct
|
||||
{
|
||||
unsigned int ref, max_iter;
|
||||
|
||||
- ASSERT(spin_is_locked(&rgt->lock));
|
||||
+ ASSERT(rw_is_locked(&rgt->lock));
|
||||
|
||||
max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
|
||||
nr_grant_entries(rgt));
|
||||
@@ -568,15 +572,15 @@ static void mapcount(
|
||||
*wrc = *rdc = 0;
|
||||
|
||||
/*
|
||||
- * Must have the local domain's grant table lock when iterating
|
||||
- * over its maptrack entries.
|
||||
+ * Must have the local domain's grant table write lock when
|
||||
+ * iterating over its maptrack entries.
|
||||
*/
|
||||
- ASSERT(spin_is_locked(&lgt->lock));
|
||||
+ ASSERT(rw_is_write_locked(&lgt->lock));
|
||||
/*
|
||||
- * Must have the remote domain's grant table lock while counting
|
||||
- * its active entries.
|
||||
+ * Must have the remote domain's grant table write lock while
|
||||
+ * counting its active entries.
|
||||
*/
|
||||
- ASSERT(spin_is_locked(&rd->grant_table->lock));
|
||||
+ ASSERT(rw_is_write_locked(&rd->grant_table->lock));
|
||||
|
||||
for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
|
||||
{
|
||||
@@ -616,6 +620,7 @@ __gnttab_map_grant_ref(
|
||||
grant_entry_v2_t *sha2;
|
||||
grant_entry_header_t *shah;
|
||||
uint16_t *status;
|
||||
+ bool_t need_iommu;
|
||||
|
||||
led = current;
|
||||
ld = led->domain;
|
||||
@@ -661,7 +666,7 @@ __gnttab_map_grant_ref(
|
||||
}
|
||||
|
||||
rgt = rd->grant_table;
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
if ( rgt->gt_version == 0 )
|
||||
PIN_FAIL(unlock_out, GNTST_general_error,
|
||||
@@ -735,7 +740,7 @@ __gnttab_map_grant_ref(
|
||||
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
|
||||
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
/* pg may be set, with a refcount included, from __get_paged_frame */
|
||||
if ( !pg )
|
||||
@@ -811,12 +816,14 @@ __gnttab_map_grant_ref(
|
||||
goto undo_out;
|
||||
}
|
||||
|
||||
- double_gt_lock(lgt, rgt);
|
||||
-
|
||||
- if ( gnttab_need_iommu_mapping(ld) )
|
||||
+ need_iommu = gnttab_need_iommu_mapping(ld);
|
||||
+ if ( need_iommu )
|
||||
{
|
||||
unsigned int wrc, rdc;
|
||||
int err = 0;
|
||||
+
|
||||
+ double_gt_lock(lgt, rgt);
|
||||
+
|
||||
/* We're not translated, so we know that gmfns and mfns are
|
||||
the same things, so the IOMMU entry is always 1-to-1. */
|
||||
mapcount(lgt, rd, frame, &wrc, &rdc);
|
||||
@@ -842,12 +849,22 @@ __gnttab_map_grant_ref(
|
||||
|
||||
TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
|
||||
|
||||
+ /*
|
||||
+ * All maptrack entry users check mt->flags first before using the
|
||||
+ * other fields so just ensure the flags field is stored last.
|
||||
+ *
|
||||
+ * However, if gnttab_need_iommu_mapping() then this would race
|
||||
+ * with a concurrent mapcount() call (on an unmap, for example)
|
||||
+ * and a lock is required.
|
||||
+ */
|
||||
mt = &maptrack_entry(lgt, handle);
|
||||
mt->domid = op->dom;
|
||||
mt->ref = op->ref;
|
||||
- mt->flags = op->flags;
|
||||
+ wmb();
|
||||
+ write_atomic(&mt->flags, op->flags);
|
||||
|
||||
- double_gt_unlock(lgt, rgt);
|
||||
+ if ( need_iommu )
|
||||
+ double_gt_unlock(lgt, rgt);
|
||||
|
||||
op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
|
||||
op->handle = handle;
|
||||
@@ -870,7 +887,7 @@ __gnttab_map_grant_ref(
|
||||
put_page(pg);
|
||||
}
|
||||
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
act = active_entry_acquire(rgt, op->ref);
|
||||
|
||||
@@ -893,7 +910,7 @@ __gnttab_map_grant_ref(
|
||||
active_entry_release(act);
|
||||
|
||||
unlock_out:
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
op->status = rc;
|
||||
put_maptrack_handle(lgt, handle);
|
||||
rcu_unlock_domain(rd);
|
||||
@@ -943,18 +960,19 @@ __gnttab_unmap_common(
|
||||
}
|
||||
|
||||
op->map = &maptrack_entry(lgt, op->handle);
|
||||
- spin_lock(&lgt->lock);
|
||||
|
||||
- if ( unlikely(!op->map->flags) )
|
||||
+ read_lock(&lgt->lock);
|
||||
+
|
||||
+ if ( unlikely(!read_atomic(&op->map->flags)) )
|
||||
{
|
||||
- spin_unlock(&lgt->lock);
|
||||
+ read_unlock(&lgt->lock);
|
||||
gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
|
||||
op->status = GNTST_bad_handle;
|
||||
return;
|
||||
}
|
||||
|
||||
dom = op->map->domid;
|
||||
- spin_unlock(&lgt->lock);
|
||||
+ read_unlock(&lgt->lock);
|
||||
|
||||
if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
|
||||
{
|
||||
@@ -975,9 +993,10 @@ __gnttab_unmap_common(
|
||||
TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom);
|
||||
|
||||
rgt = rd->grant_table;
|
||||
- double_gt_lock(lgt, rgt);
|
||||
|
||||
- op->flags = op->map->flags;
|
||||
+ read_lock(&rgt->lock);
|
||||
+
|
||||
+ op->flags = read_atomic(&op->map->flags);
|
||||
if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
|
||||
{
|
||||
gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle);
|
||||
@@ -1024,31 +1043,34 @@ __gnttab_unmap_common(
|
||||
act->pin -= GNTPIN_hstw_inc;
|
||||
}
|
||||
|
||||
- if ( gnttab_need_iommu_mapping(ld) )
|
||||
+ act_release_out:
|
||||
+ active_entry_release(act);
|
||||
+ unmap_out:
|
||||
+ read_unlock(&rgt->lock);
|
||||
+
|
||||
+ if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
|
||||
{
|
||||
unsigned int wrc, rdc;
|
||||
int err = 0;
|
||||
+
|
||||
+ double_gt_lock(lgt, rgt);
|
||||
+
|
||||
mapcount(lgt, rd, op->frame, &wrc, &rdc);
|
||||
if ( (wrc + rdc) == 0 )
|
||||
err = iommu_unmap_page(ld, op->frame);
|
||||
else if ( wrc == 0 )
|
||||
err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable);
|
||||
+
|
||||
+ double_gt_unlock(lgt, rgt);
|
||||
+
|
||||
if ( err )
|
||||
- {
|
||||
rc = GNTST_general_error;
|
||||
- goto act_release_out;
|
||||
- }
|
||||
}
|
||||
|
||||
/* If just unmapped a writable mapping, mark as dirtied */
|
||||
- if ( !(op->flags & GNTMAP_readonly) )
|
||||
+ if ( rc == GNTST_okay && !(op->flags & GNTMAP_readonly) )
|
||||
gnttab_mark_dirty(rd, op->frame);
|
||||
|
||||
- act_release_out:
|
||||
- active_entry_release(act);
|
||||
- unmap_out:
|
||||
- double_gt_unlock(lgt, rgt);
|
||||
-
|
||||
op->status = rc;
|
||||
rcu_unlock_domain(rd);
|
||||
}
|
||||
@@ -1078,8 +1100,8 @@ __gnttab_unmap_common_complete(struct gn
|
||||
|
||||
rcu_lock_domain(rd);
|
||||
rgt = rd->grant_table;
|
||||
- spin_lock(&rgt->lock);
|
||||
|
||||
+ read_lock(&rgt->lock);
|
||||
if ( rgt->gt_version == 0 )
|
||||
goto unlock_out;
|
||||
|
||||
@@ -1145,7 +1167,7 @@ __gnttab_unmap_common_complete(struct gn
|
||||
act_release_out:
|
||||
active_entry_release(act);
|
||||
unlock_out:
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
if ( put_handle )
|
||||
{
|
||||
@@ -1332,11 +1354,13 @@ gnttab_unpopulate_status_frames(struct d
|
||||
gt->nr_status_frames = 0;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Grow the grant table. The caller must hold the grant table's
|
||||
+ * write lock before calling this function.
|
||||
+ */
|
||||
int
|
||||
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
|
||||
{
|
||||
- /* d's grant table lock must be held by the caller */
|
||||
-
|
||||
struct grant_table *gt = d->grant_table;
|
||||
unsigned int i, j;
|
||||
|
||||
@@ -1442,7 +1466,7 @@ gnttab_setup_table(
|
||||
}
|
||||
|
||||
gt = d->grant_table;
|
||||
- spin_lock(>->lock);
|
||||
+ write_lock(>->lock);
|
||||
|
||||
if ( gt->gt_version == 0 )
|
||||
gt->gt_version = 1;
|
||||
@@ -1470,7 +1494,7 @@ gnttab_setup_table(
|
||||
}
|
||||
|
||||
out3:
|
||||
- spin_unlock(>->lock);
|
||||
+ write_unlock(>->lock);
|
||||
out2:
|
||||
rcu_unlock_domain(d);
|
||||
out1:
|
||||
@@ -1512,13 +1536,13 @@ gnttab_query_size(
|
||||
goto query_out_unlock;
|
||||
}
|
||||
|
||||
- spin_lock(&d->grant_table->lock);
|
||||
+ read_lock(&d->grant_table->lock);
|
||||
|
||||
op.nr_frames = nr_grant_frames(d->grant_table);
|
||||
op.max_nr_frames = max_grant_frames;
|
||||
op.status = GNTST_okay;
|
||||
|
||||
- spin_unlock(&d->grant_table->lock);
|
||||
+ read_unlock(&d->grant_table->lock);
|
||||
|
||||
|
||||
query_out_unlock:
|
||||
@@ -1544,7 +1568,7 @@ gnttab_prepare_for_transfer(
|
||||
union grant_combo scombo, prev_scombo, new_scombo;
|
||||
int retries = 0;
|
||||
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
if ( rgt->gt_version == 0 )
|
||||
{
|
||||
@@ -1595,11 +1619,11 @@ gnttab_prepare_for_transfer(
|
||||
scombo = prev_scombo;
|
||||
}
|
||||
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
return 1;
|
||||
|
||||
fail:
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1614,6 +1638,7 @@ gnttab_transfer(
|
||||
struct gnttab_transfer gop;
|
||||
unsigned long mfn;
|
||||
unsigned int max_bitsize;
|
||||
+ struct active_grant_entry *act;
|
||||
|
||||
for ( i = 0; i < count; i++ )
|
||||
{
|
||||
@@ -1791,7 +1816,8 @@ gnttab_transfer(
|
||||
TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
|
||||
|
||||
/* Tell the guest about its new page frame. */
|
||||
- spin_lock(&e->grant_table->lock);
|
||||
+ read_lock(&e->grant_table->lock);
|
||||
+ act = active_entry_acquire(e->grant_table, gop.ref);
|
||||
|
||||
if ( e->grant_table->gt_version == 1 )
|
||||
{
|
||||
@@ -1809,7 +1835,8 @@ gnttab_transfer(
|
||||
shared_entry_header(e->grant_table, gop.ref)->flags |=
|
||||
GTF_transfer_completed;
|
||||
|
||||
- spin_unlock(&e->grant_table->lock);
|
||||
+ active_entry_release(act);
|
||||
+ read_unlock(&e->grant_table->lock);
|
||||
|
||||
rcu_unlock_domain(e);
|
||||
|
||||
@@ -1847,7 +1874,7 @@ __release_grant_for_copy(
|
||||
released_read = 0;
|
||||
released_write = 0;
|
||||
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
act = active_entry_acquire(rgt, gref);
|
||||
sha = shared_entry_header(rgt, gref);
|
||||
@@ -1889,7 +1916,7 @@ __release_grant_for_copy(
|
||||
}
|
||||
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
if ( td != rd )
|
||||
{
|
||||
@@ -1947,7 +1974,7 @@ __acquire_grant_for_copy(
|
||||
|
||||
*page = NULL;
|
||||
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
if ( rgt->gt_version == 0 )
|
||||
PIN_FAIL(gt_unlock_out, GNTST_general_error,
|
||||
@@ -2023,20 +2050,20 @@ __acquire_grant_for_copy(
|
||||
* here and reacquire
|
||||
*/
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
|
||||
readonly, &grant_frame, page,
|
||||
&trans_page_off, &trans_length, 0);
|
||||
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
act = active_entry_acquire(rgt, gref);
|
||||
|
||||
if ( rc != GNTST_okay ) {
|
||||
__fixup_status_for_copy_pin(act, status);
|
||||
rcu_unlock_domain(td);
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@@ -2049,7 +2076,7 @@ __acquire_grant_for_copy(
|
||||
__fixup_status_for_copy_pin(act, status);
|
||||
rcu_unlock_domain(td);
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
put_page(*page);
|
||||
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
|
||||
frame, page, page_off, length,
|
||||
@@ -2118,7 +2145,7 @@ __acquire_grant_for_copy(
|
||||
*frame = act->frame;
|
||||
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
return rc;
|
||||
|
||||
unlock_out_clear:
|
||||
@@ -2133,7 +2160,7 @@ __acquire_grant_for_copy(
|
||||
active_entry_release(act);
|
||||
|
||||
gt_unlock_out:
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
@@ -2449,7 +2476,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
|
||||
if ( gt->gt_version == op.version )
|
||||
goto out;
|
||||
|
||||
- spin_lock(>->lock);
|
||||
+ write_lock(>->lock);
|
||||
/* Make sure that the grant table isn't currently in use when we
|
||||
change the version number, except for the first 8 entries which
|
||||
are allowed to be in use (xenstore/xenconsole keeps them mapped).
|
||||
@@ -2534,7 +2561,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
|
||||
gt->gt_version = op.version;
|
||||
|
||||
out_unlock:
|
||||
- spin_unlock(>->lock);
|
||||
+ write_unlock(>->lock);
|
||||
|
||||
out:
|
||||
op.version = gt->gt_version;
|
||||
@@ -2590,7 +2617,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL
|
||||
|
||||
op.status = GNTST_okay;
|
||||
|
||||
- spin_lock(>->lock);
|
||||
+ read_lock(>->lock);
|
||||
|
||||
for ( i = 0; i < op.nr_frames; i++ )
|
||||
{
|
||||
@@ -2599,7 +2626,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL
|
||||
op.status = GNTST_bad_virt_addr;
|
||||
}
|
||||
|
||||
- spin_unlock(>->lock);
|
||||
+ read_unlock(>->lock);
|
||||
out2:
|
||||
rcu_unlock_domain(d);
|
||||
out1:
|
||||
@@ -2649,7 +2676,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
|
||||
struct active_grant_entry *act_b = NULL;
|
||||
s16 rc = GNTST_okay;
|
||||
|
||||
- spin_lock(>->lock);
|
||||
+ write_lock(>->lock);
|
||||
|
||||
if ( gt->gt_version == 0 )
|
||||
PIN_FAIL(out, GNTST_general_error, "grant table not yet set up\n");
|
||||
@@ -2700,7 +2727,7 @@ out:
|
||||
active_entry_release(act_b);
|
||||
if ( act_a != NULL )
|
||||
active_entry_release(act_a);
|
||||
- spin_unlock(>->lock);
|
||||
+ write_unlock(>->lock);
|
||||
|
||||
rcu_unlock_domain(d);
|
||||
|
||||
@@ -2771,12 +2798,12 @@ static int __gnttab_cache_flush(gnttab_c
|
||||
|
||||
if ( d != owner )
|
||||
{
|
||||
- spin_lock(&owner->grant_table->lock);
|
||||
+ read_lock(&owner->grant_table->lock);
|
||||
|
||||
ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
|
||||
if ( ret != 0 )
|
||||
{
|
||||
- spin_unlock(&owner->grant_table->lock);
|
||||
+ read_unlock(&owner->grant_table->lock);
|
||||
rcu_unlock_domain(d);
|
||||
put_page(page);
|
||||
return ret;
|
||||
@@ -2796,7 +2823,7 @@ static int __gnttab_cache_flush(gnttab_c
|
||||
ret = 0;
|
||||
|
||||
if ( d != owner )
|
||||
- spin_unlock(&owner->grant_table->lock);
|
||||
+ read_unlock(&owner->grant_table->lock);
|
||||
unmap_domain_page(v);
|
||||
put_page(page);
|
||||
|
||||
@@ -3015,7 +3042,7 @@ grant_table_create(
|
||||
goto no_mem_0;
|
||||
|
||||
/* Simple stuff. */
|
||||
- spin_lock_init(&t->lock);
|
||||
+ rwlock_init(&t->lock);
|
||||
spin_lock_init(&t->maptrack_lock);
|
||||
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
|
||||
|
||||
@@ -3125,7 +3152,7 @@ gnttab_release_mappings(
|
||||
}
|
||||
|
||||
rgt = rd->grant_table;
|
||||
- spin_lock(&rgt->lock);
|
||||
+ read_lock(&rgt->lock);
|
||||
|
||||
act = active_entry_acquire(rgt, ref);
|
||||
sha = shared_entry_header(rgt, ref);
|
||||
@@ -3186,7 +3213,7 @@ gnttab_release_mappings(
|
||||
gnttab_clear_flag(_GTF_reading, status);
|
||||
|
||||
active_entry_release(act);
|
||||
- spin_unlock(&rgt->lock);
|
||||
+ read_unlock(&rgt->lock);
|
||||
|
||||
rcu_unlock_domain(rd);
|
||||
|
||||
@@ -3234,7 +3261,7 @@ static void gnttab_usage_print(struct do
|
||||
printk(" -------- active -------- -------- shared --------\n");
|
||||
printk("[ref] localdom mfn pin localdom gmfn flags\n");
|
||||
|
||||
- spin_lock(>->lock);
|
||||
+ read_lock(>->lock);
|
||||
|
||||
if ( gt->gt_version == 0 )
goto out;
@@ -3287,7 +3314,7 @@ static void gnttab_usage_print(struct do
}
out:
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
if ( first )
printk("grant-table for remote domain:%5d ... "
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -64,6 +64,11 @@ struct grant_mapping {
/* Per-domain grant information. */
struct grant_table {
+ /*
+ * Lock protecting updates to grant table state (version, active
+ * entry list, etc.)
+ */
+ rwlock_t lock;
/* Table size. Number of frames shared with guest */
unsigned int nr_grant_frames;
/* Shared grant table (see include/public/grant_table.h). */
@@ -84,8 +89,6 @@ struct grant_table {
unsigned int maptrack_limit;
/* Lock protecting the maptrack page list, head, and limit */
spinlock_t maptrack_lock;
- /* Lock protecting updates to active and shared grant tables. */
- spinlock_t lock;
/* The defined versions are 1 and 2. Set to 0 if we don't know
what version to use yet. */
unsigned gt_version;
@@ -103,7 +106,7 @@ gnttab_release_mappings(
struct domain *d);
/* Increase the size of a domain's grant table.
- * Caller must hold d's grant table lock.
+ * Caller must hold d's grant table write lock.
*/
int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames);
@@ -1,47 +0,0 @@
# Commit a622b5ade2bdf79ad95e6088a4041e75253c43f3
# Date 2015-06-16 12:30:16 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: factor out freeing an event channel
We're going to want to free an event channel from two places. Factor out
the code into a free_evtchn() function.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -194,6 +194,17 @@ static int get_free_port(struct domain *
return port;
}
+static void free_evtchn(struct domain *d, struct evtchn *chn)
+{
+ /* Clear pending event to avoid unexpected behavior on re-bind. */
+ evtchn_port_clear_pending(d, chn);
+
+ /* Reset binding to vcpu0 when the channel is freed. */
+ chn->state = ECS_FREE;
+ chn->notify_vcpu_id = 0;
+
+ xsm_evtchn_close_post(chn);
+}
static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
{
@@ -568,14 +579,7 @@ static long __evtchn_close(struct domain
BUG();
}
- /* Clear pending event to avoid unexpected behavior on re-bind. */
- evtchn_port_clear_pending(d1, chn1);
-
- /* Reset binding to vcpu0 when the channel is freed. */
- chn1->state = ECS_FREE;
- chn1->notify_vcpu_id = 0;
-
- xsm_evtchn_close_post(chn1);
+ free_evtchn(d1, chn1);
out:
if ( d2 != NULL )
@@ -1,63 +0,0 @@
# Commit 01280dc19cf3da089f98faf4f524b54b5a191df0
# Date 2015-06-18 14:53:23 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: simplify port_is_valid()
By keeping a count of the number of currently valid event channels,
port_is_valid() can be simplified.
d->valid_evtchns is only increased (while holding d->event_lock), so
port_is_valid() may be safely called without taking the lock (this
will be useful later).
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -191,6 +191,8 @@ static int get_free_port(struct domain *
return -ENOMEM;
bucket_from_port(d, port) = chn;
+ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
+
return port;
}
@@ -1264,6 +1266,7 @@ int evtchn_init(struct domain *d)
d->evtchn = alloc_evtchn_bucket(d, 0);
if ( !d->evtchn )
return -ENOMEM;
+ d->valid_evtchns = EVTCHNS_PER_BUCKET;
spin_lock_init(&d->event_lock);
if ( get_free_port(d) != 0 )
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -90,11 +90,7 @@ static inline bool_t port_is_valid(struc
{
if ( p >= d->max_evtchns )
return 0;
- if ( !d->evtchn )
- return 0;
- if ( p < EVTCHNS_PER_BUCKET )
- return 1;
- return group_from_port(d, p) != NULL && bucket_from_port(d, p) != NULL;
+ return p < read_atomic(&d->valid_evtchns);
}
static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -339,8 +339,9 @@ struct domain
/* Event channel information. */
struct evtchn *evtchn; /* first bucket only */
struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
- unsigned int max_evtchns;
- unsigned int max_evtchn_port;
+ unsigned int max_evtchns; /* number supported by ABI */
+ unsigned int max_evtchn_port; /* max permitted port number */
+ unsigned int valid_evtchns; /* number of allocated event channels */
spinlock_t event_lock;
const struct evtchn_port_ops *evtchn_port_ops;
struct evtchn_fifo_domain *evtchn_fifo;
@@ -1,32 +0,0 @@
# Commit e156654d4eb2fdeb524e6b40838767a5dc918966
# Date 2015-06-18 14:54:25 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: remove the locking when unmasking an event channel
The event channel lock is no longer required to check if the port is
valid.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -931,8 +931,6 @@ int evtchn_unmask(unsigned int port)
struct domain *d = current->domain;
struct evtchn *evtchn;
- ASSERT(spin_is_locked(&d->event_lock));
-
if ( unlikely(!port_is_valid(d, port)) )
return -EINVAL;
@@ -1099,9 +1097,7 @@ long do_event_channel_op(int cmd, XEN_GU
struct evtchn_unmask unmask;
if ( copy_from_guest(&unmask, arg, 1) != 0 )
return -EFAULT;
- spin_lock(&current->domain->event_lock);
rc = evtchn_unmask(unmask.port);
- spin_unlock(&current->domain->event_lock);
break;
}
@ -1,287 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit 236e13ce60e1c0eb0535ad258e74a3789bc0d074
|
||||
# Date 2015-06-19 10:58:45 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI-X: cleanup
|
||||
|
||||
- __pci_enable_msix() now checks that an MSI-X capability was actually
|
||||
found
|
||||
- pass "pos" to msix_capability_init() as both callers already know it
|
||||
(and hence there's no need to re-obtain it)
|
||||
- call __pci_disable_msi{,x}() directly instead of via
|
||||
pci_disable_msi() from __pci_enable_msi{x,}() state validation paths
|
||||
- use msix_control_reg() instead of open coding it
|
||||
- log message adjustments
|
||||
- coding style corrections
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -35,6 +35,8 @@
|
||||
static s8 __read_mostly use_msi = -1;
|
||||
boolean_param("msi", use_msi);
|
||||
|
||||
+static void __pci_disable_msix(struct msi_desc *);
|
||||
+
|
||||
/* bitmap indicate which fixed map is free */
|
||||
static DEFINE_SPINLOCK(msix_fixmap_lock);
|
||||
static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
|
||||
@@ -129,12 +131,14 @@ void msi_compose_msg(unsigned vector, co
|
||||
unsigned dest;
|
||||
|
||||
memset(msg, 0, sizeof(*msg));
|
||||
- if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) {
|
||||
+ if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
|
||||
+ {
|
||||
dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
- if ( vector ) {
|
||||
+ if ( vector )
|
||||
+ {
|
||||
cpumask_t *mask = this_cpu(scratch_mask);
|
||||
|
||||
cpumask_and(mask, cpu_mask, &cpu_online_map);
|
||||
@@ -195,8 +199,7 @@ static void read_msi_msg(struct msi_desc
|
||||
}
|
||||
case PCI_CAP_ID_MSIX:
|
||||
{
|
||||
- void __iomem *base;
|
||||
- base = entry->mask_base;
|
||||
+ void __iomem *base = entry->mask_base;
|
||||
|
||||
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
|
||||
@@ -257,8 +260,7 @@ static int write_msi_msg(struct msi_desc
|
||||
}
|
||||
case PCI_CAP_ID_MSIX:
|
||||
{
|
||||
- void __iomem *base;
|
||||
- base = entry->mask_base;
|
||||
+ void __iomem *base = entry->mask_base;
|
||||
|
||||
writel(msg->address_lo,
|
||||
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
@@ -281,7 +283,7 @@ void set_msi_affinity(struct irq_desc *d
|
||||
struct msi_desc *msi_desc = desc->msi_desc;
|
||||
|
||||
dest = set_desc_affinity(desc, mask);
|
||||
- if (dest == BAD_APICID || !msi_desc)
|
||||
+ if ( dest == BAD_APICID || !msi_desc )
|
||||
return;
|
||||
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
@@ -332,11 +334,11 @@ static void msix_set_enable(struct pci_d
|
||||
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
|
||||
if ( pos )
|
||||
{
|
||||
- control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS);
|
||||
+ control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||||
control &= ~PCI_MSIX_FLAGS_ENABLE;
|
||||
if ( enable )
|
||||
control |= PCI_MSIX_FLAGS_ENABLE;
|
||||
- pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control);
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -353,9 +355,11 @@ static void msi_set_mask_bit(struct irq_
|
||||
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
BUG_ON(!entry || !entry->dev);
|
||||
- switch (entry->msi_attrib.type) {
|
||||
+ switch ( entry->msi_attrib.type )
|
||||
+ {
|
||||
case PCI_CAP_ID_MSI:
|
||||
- if (entry->msi_attrib.maskbit) {
|
||||
+ if ( entry->msi_attrib.maskbit )
|
||||
+ {
|
||||
u32 mask_bits;
|
||||
u16 seg = entry->dev->seg;
|
||||
u8 bus = entry->dev->bus;
|
||||
@@ -701,13 +705,14 @@ static u64 read_pci_mem_bar(u16 seg, u8
|
||||
* requested MSI-X entries with allocated irqs or non-zero for otherwise.
|
||||
**/
|
||||
static int msix_capability_init(struct pci_dev *dev,
|
||||
+ unsigned int pos,
|
||||
struct msi_info *msi,
|
||||
struct msi_desc **desc,
|
||||
unsigned int nr_entries)
|
||||
{
|
||||
struct arch_msix *msix = dev->msix;
|
||||
struct msi_desc *entry = NULL;
|
||||
- int pos, vf;
|
||||
+ int vf;
|
||||
u16 control;
|
||||
u64 table_paddr;
|
||||
u32 table_offset;
|
||||
@@ -719,7 +724,6 @@ static int msix_capability_init(struct p
|
||||
|
||||
ASSERT(spin_is_locked(&pcidevs_lock));
|
||||
|
||||
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
|
||||
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||||
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
|
||||
|
||||
@@ -884,10 +888,9 @@ static int __pci_enable_msi(struct msi_i
|
||||
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
|
||||
if ( old_desc )
|
||||
{
|
||||
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on "
|
||||
- "device %04x:%02x:%02x.%01x\n",
|
||||
- msi->irq, msi->seg, msi->bus,
|
||||
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
+ printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
|
||||
+ msi->irq, msi->seg, msi->bus,
|
||||
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
*desc = old_desc;
|
||||
return 0;
|
||||
}
|
||||
@@ -895,10 +898,10 @@ static int __pci_enable_msi(struct msi_i
|
||||
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
|
||||
if ( old_desc )
|
||||
{
|
||||
- dprintk(XENLOG_WARNING, "MSI-X is already in use on "
|
||||
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
|
||||
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
- pci_disable_msi(old_desc);
|
||||
+ printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n",
|
||||
+ msi->seg, msi->bus,
|
||||
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
+ __pci_disable_msix(old_desc);
|
||||
}
|
||||
|
||||
return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
|
||||
@@ -912,7 +915,6 @@ static void __pci_disable_msi(struct msi
|
||||
msi_set_enable(dev, 0);
|
||||
|
||||
BUG_ON(list_empty(&dev->msi_list));
|
||||
-
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -932,7 +934,7 @@ static void __pci_disable_msi(struct msi
|
||||
**/
|
||||
static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
|
||||
{
|
||||
- int status, pos, nr_entries;
|
||||
+ int pos, nr_entries;
|
||||
struct pci_dev *pdev;
|
||||
u16 control;
|
||||
u8 slot = PCI_SLOT(msi->devfn);
|
||||
@@ -941,23 +943,22 @@ static int __pci_enable_msix(struct msi_
|
||||
|
||||
ASSERT(spin_is_locked(&pcidevs_lock));
|
||||
pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
|
||||
- if ( !pdev )
|
||||
+ pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
|
||||
+ if ( !pdev || !pos )
|
||||
return -ENODEV;
|
||||
|
||||
- pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
|
||||
control = pci_conf_read16(msi->seg, msi->bus, slot, func,
|
||||
msix_control_reg(pos));
|
||||
nr_entries = multi_msix_capable(control);
|
||||
- if (msi->entry_nr >= nr_entries)
|
||||
+ if ( msi->entry_nr >= nr_entries )
|
||||
return -EINVAL;
|
||||
|
||||
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
|
||||
if ( old_desc )
|
||||
{
|
||||
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on "
|
||||
- "device %04x:%02x:%02x.%01x\n",
|
||||
- msi->irq, msi->seg, msi->bus,
|
||||
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
+ printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
|
||||
+ msi->irq, msi->seg, msi->bus,
|
||||
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
*desc = old_desc;
|
||||
return 0;
|
||||
}
|
||||
@@ -965,15 +966,13 @@ static int __pci_enable_msix(struct msi_
|
||||
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
|
||||
if ( old_desc )
|
||||
{
|
||||
- dprintk(XENLOG_WARNING, "MSI is already in use on "
|
||||
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
|
||||
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
- pci_disable_msi(old_desc);
|
||||
-
|
||||
+ printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n",
|
||||
+ msi->seg, msi->bus,
|
||||
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
|
||||
+ __pci_disable_msi(old_desc);
|
||||
}
|
||||
|
||||
- status = msix_capability_init(pdev, msi, desc, nr_entries);
|
||||
- return status;
|
||||
+ return msix_capability_init(pdev, pos, msi, desc, nr_entries);
|
||||
}
|
||||
|
||||
static void _pci_cleanup_msix(struct arch_msix *msix)
|
||||
@@ -991,19 +990,16 @@ static void _pci_cleanup_msix(struct arc
|
||||
|
||||
static void __pci_disable_msix(struct msi_desc *entry)
|
||||
{
|
||||
- struct pci_dev *dev;
|
||||
- int pos;
|
||||
- u16 control, seg;
|
||||
- u8 bus, slot, func;
|
||||
-
|
||||
- dev = entry->dev;
|
||||
- seg = dev->seg;
|
||||
- bus = dev->bus;
|
||||
- slot = PCI_SLOT(dev->devfn);
|
||||
- func = PCI_FUNC(dev->devfn);
|
||||
+ struct pci_dev *dev = entry->dev;
|
||||
+ u16 seg = dev->seg;
|
||||
+ u8 bus = dev->bus;
|
||||
+ u8 slot = PCI_SLOT(dev->devfn);
|
||||
+ u8 func = PCI_FUNC(dev->devfn);
|
||||
+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
|
||||
+ PCI_CAP_ID_MSIX);
|
||||
+ u16 control = pci_conf_read16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos));
|
||||
|
||||
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
|
||||
- control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||||
msix_set_enable(dev, 0);
|
||||
|
||||
BUG_ON(list_empty(&dev->msi_list));
|
||||
@@ -1045,7 +1041,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
|
||||
u16 control = pci_conf_read16(seg, bus, slot, func,
|
||||
msix_control_reg(pos));
|
||||
|
||||
- rc = msix_capability_init(pdev, NULL, NULL,
|
||||
+ rc = msix_capability_init(pdev, pos, NULL, NULL,
|
||||
multi_msix_capable(control));
|
||||
}
|
||||
spin_unlock(&pcidevs_lock);
|
||||
@@ -1064,8 +1060,8 @@ int pci_enable_msi(struct msi_info *msi,
|
||||
if ( !use_msi )
|
||||
return -EPERM;
|
||||
|
||||
- return msi->table_base ? __pci_enable_msix(msi, desc) :
|
||||
- __pci_enable_msi(msi, desc);
|
||||
+ return msi->table_base ? __pci_enable_msix(msi, desc) :
|
||||
+ __pci_enable_msi(msi, desc);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1115,7 +1111,9 @@ int pci_restore_msi_state(struct pci_dev
|
||||
if ( !pdev )
|
||||
return -EINVAL;
|
||||
|
||||
- ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
|
||||
+ ret = xsm_resource_setup_pci(XSM_PRIV,
|
||||
+ (pdev->seg << 16) | (pdev->bus << 8) |
|
||||
+ pdev->devfn);
|
||||
if ( ret )
|
||||
return ret;
|
||||
|
@ -1,388 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit ad28e42bd1d28d746988ed71654e8aa670629753
|
||||
# Date 2015-06-19 10:59:53 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI: track host and guest masking separately
|
||||
|
||||
In particular we want to avoid losing track of our own intention to
|
||||
have an entry masked. Physical unmasking now happens only when both
|
||||
host and guest requested so.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
# Commit 84d6add5593d865736831d150da7c38588f669f6
|
||||
# Date 2015-07-10 12:36:24 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI: fix guest unmasking when handling IRQ via event channel
|
||||
|
||||
Rather than assuming only PV guests need special treatment (and
|
||||
dealing with that directly when an IRQ gets set up), keep all guest MSI
|
||||
IRQs masked until either the (HVM) guest unmasks them via vMSI or the
|
||||
(PV, PVHVM, or PVH) guest sets up an event channel for it.
|
||||
|
||||
To not further clutter the common evtchn_bind_pirq() with x86-specific
|
||||
code, introduce an arch_evtchn_bind_pirq() hook instead.
|
||||
|
||||
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/hpet.c
|
||||
+++ b/xen/arch/x86/hpet.c
|
||||
@@ -240,7 +240,7 @@ static void hpet_msi_unmask(struct irq_d
|
||||
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
|
||||
cfg |= HPET_TN_ENABLE;
|
||||
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
|
||||
- ch->msi.msi_attrib.masked = 0;
|
||||
+ ch->msi.msi_attrib.host_masked = 0;
|
||||
}
|
||||
|
||||
static void hpet_msi_mask(struct irq_desc *desc)
|
||||
@@ -251,7 +251,7 @@ static void hpet_msi_mask(struct irq_des
|
||||
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
|
||||
cfg &= ~HPET_TN_ENABLE;
|
||||
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
|
||||
- ch->msi.msi_attrib.masked = 1;
|
||||
+ ch->msi.msi_attrib.host_masked = 1;
|
||||
}
|
||||
|
||||
static int hpet_msi_write(struct hpet_event_channel *ch, struct msi_msg *msg)
|
||||
--- a/xen/arch/x86/hvm/vmsi.c
|
||||
+++ b/xen/arch/x86/hvm/vmsi.c
|
||||
@@ -219,7 +219,6 @@ static int msixtbl_read(
|
||||
{
|
||||
unsigned long offset;
|
||||
struct msixtbl_entry *entry;
|
||||
- void *virt;
|
||||
unsigned int nr_entry, index;
|
||||
int r = X86EMUL_UNHANDLEABLE;
|
||||
|
||||
@@ -253,13 +252,20 @@ static int msixtbl_read(
|
||||
}
|
||||
if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
|
||||
{
|
||||
- virt = msixtbl_addr_to_virt(entry, address);
|
||||
+ const struct msi_desc *msi_desc;
|
||||
+ void *virt = msixtbl_addr_to_virt(entry, address);
|
||||
+
|
||||
if ( !virt )
|
||||
goto out;
|
||||
+ msi_desc = virt_to_msi_desc(entry->pdev, virt);
|
||||
+ if ( !msi_desc )
|
||||
+ goto out;
|
||||
if ( len == 4 )
|
||||
- *pval = readl(virt);
|
||||
+ *pval = MASK_INSR(msi_desc->msi_attrib.guest_masked,
|
||||
+ PCI_MSIX_VECTOR_BITMASK);
|
||||
else
|
||||
- *pval |= (u64)readl(virt) << 32;
|
||||
+ *pval |= (u64)MASK_INSR(msi_desc->msi_attrib.guest_masked,
|
||||
+ PCI_MSIX_VECTOR_BITMASK) << 32;
|
||||
}
|
||||
|
||||
r = X86EMUL_OKAY;
|
||||
@@ -277,7 +283,7 @@ static int msixtbl_write(struct vcpu *v,
|
||||
void *virt;
|
||||
unsigned int nr_entry, index;
|
||||
int r = X86EMUL_UNHANDLEABLE;
|
||||
- unsigned long flags, orig;
|
||||
+ unsigned long flags;
|
||||
struct irq_desc *desc;
|
||||
|
||||
if ( (len != 4 && len != 8) || (address & (len - 1)) )
|
||||
@@ -337,37 +343,7 @@ static int msixtbl_write(struct vcpu *v,
|
||||
|
||||
ASSERT(msi_desc == desc->msi_desc);
|
||||
|
||||
- orig = readl(virt);
|
||||
-
|
||||
- /*
|
||||
- * Do not allow guest to modify MSI-X control bit if it is masked
|
||||
- * by Xen. We'll only handle the case where Xen thinks that
|
||||
- * bit is unmasked, but hardware has silently masked the bit
|
||||
- * (in case of SR-IOV VF reset, etc). On the other hand, if Xen
|
||||
- * thinks that the bit is masked, but it's really not,
|
||||
- * we log a warning.
|
||||
- */
|
||||
- if ( msi_desc->msi_attrib.masked )
|
||||
- {
|
||||
- if ( !(orig & PCI_MSIX_VECTOR_BITMASK) )
|
||||
- printk(XENLOG_WARNING "MSI-X control bit is unmasked when"
|
||||
- " it is expected to be masked [%04x:%02x:%02x.%u]\n",
|
||||
- entry->pdev->seg, entry->pdev->bus,
|
||||
- PCI_SLOT(entry->pdev->devfn),
|
||||
- PCI_FUNC(entry->pdev->devfn));
|
||||
-
|
||||
- goto unlock;
|
||||
- }
|
||||
-
|
||||
- /*
|
||||
- * The mask bit is the only defined bit in the word. But we
|
||||
- * ought to preserve the reserved bits. Clearing the reserved
|
||||
- * bits can result in undefined behaviour (see PCI Local Bus
|
||||
- * Specification revision 2.3).
|
||||
- */
|
||||
- val &= PCI_MSIX_VECTOR_BITMASK;
|
||||
- val |= (orig & ~PCI_MSIX_VECTOR_BITMASK);
|
||||
- writel(val, virt);
|
||||
+ guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
|
||||
|
||||
unlock:
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
--- a/xen/arch/x86/irq.c
|
||||
+++ b/xen/arch/x86/irq.c
|
||||
@@ -2502,6 +2502,25 @@ int unmap_domain_pirq_emuirq(struct doma
|
||||
return ret;
|
||||
}
|
||||
|
||||
+void arch_evtchn_bind_pirq(struct domain *d, int pirq)
|
||||
+{
|
||||
+ int irq = domain_pirq_to_irq(d, pirq);
|
||||
+ struct irq_desc *desc;
|
||||
+ unsigned long flags;
|
||||
+
|
||||
+ if ( irq <= 0 )
|
||||
+ return;
|
||||
+
|
||||
+ if ( is_hvm_domain(d) )
|
||||
+ map_domain_emuirq_pirq(d, pirq, IRQ_PT);
|
||||
+
|
||||
+ desc = irq_to_desc(irq);
|
||||
+ spin_lock_irqsave(&desc->lock, flags);
|
||||
+ if ( desc->msi_desc )
|
||||
+ guest_mask_msi_irq(desc, 0);
|
||||
+ spin_unlock_irqrestore(&desc->lock, flags);
|
||||
+}
|
||||
+
|
||||
bool_t hvm_domain_use_pirq(const struct domain *d, const struct pirq *pirq)
|
||||
{
|
||||
return is_hvm_domain(d) && pirq &&
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -349,9 +349,10 @@ int msi_maskable_irq(const struct msi_de
|
||||
|| entry->msi_attrib.maskbit;
|
||||
}
|
||||
|
||||
-static void msi_set_mask_bit(struct irq_desc *desc, int flag)
|
||||
+static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
|
||||
{
|
||||
struct msi_desc *entry = desc->msi_desc;
|
||||
+ bool_t flag = host || guest;
|
||||
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
BUG_ON(!entry || !entry->dev);
|
||||
@@ -383,7 +384,8 @@ static void msi_set_mask_bit(struct irq_
|
||||
BUG();
|
||||
break;
|
||||
}
|
||||
- entry->msi_attrib.masked = !!flag;
|
||||
+ entry->msi_attrib.host_masked = host;
|
||||
+ entry->msi_attrib.guest_masked = guest;
|
||||
}
|
||||
|
||||
static int msi_get_mask_bit(const struct msi_desc *entry)
|
||||
@@ -405,20 +407,30 @@ static int msi_get_mask_bit(const struct
|
||||
|
||||
void mask_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 1);
|
||||
+ msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked);
|
||||
}
|
||||
|
||||
void unmask_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 0);
|
||||
+ msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked);
|
||||
+}
|
||||
+
|
||||
+void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask)
|
||||
+{
|
||||
+ msi_set_mask_bit(desc, desc->msi_desc->msi_attrib.host_masked, mask);
|
||||
}
|
||||
|
||||
static unsigned int startup_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- unmask_msi_irq(desc);
|
||||
+ msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST));
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static void shutdown_msi_irq(struct irq_desc *desc)
|
||||
+{
|
||||
+ msi_set_mask_bit(desc, 1, 1);
|
||||
+}
|
||||
+
|
||||
void ack_nonmaskable_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
irq_complete_move(desc);
|
||||
@@ -443,7 +455,7 @@ void end_nonmaskable_msi_irq(struct irq_
|
||||
static hw_irq_controller pci_msi_maskable = {
|
||||
.typename = "PCI-MSI/-X",
|
||||
.startup = startup_msi_irq,
|
||||
- .shutdown = mask_msi_irq,
|
||||
+ .shutdown = shutdown_msi_irq,
|
||||
.enable = unmask_msi_irq,
|
||||
.disable = mask_msi_irq,
|
||||
.ack = ack_maskable_msi_irq,
|
||||
@@ -591,7 +603,8 @@ static int msi_capability_init(struct pc
|
||||
entry[i].msi_attrib.is_64 = is_64bit_address(control);
|
||||
entry[i].msi_attrib.entry_nr = i;
|
||||
entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
|
||||
- entry[i].msi_attrib.masked = 1;
|
||||
+ entry[i].msi_attrib.host_masked = 1;
|
||||
+ entry[i].msi_attrib.guest_masked = 0;
|
||||
entry[i].msi_attrib.pos = pos;
|
||||
if ( entry[i].msi_attrib.maskbit )
|
||||
entry[i].msi.mpos = mpos;
|
||||
@@ -817,7 +830,8 @@ static int msix_capability_init(struct p
|
||||
entry->msi_attrib.is_64 = 1;
|
||||
entry->msi_attrib.entry_nr = msi->entry_nr;
|
||||
entry->msi_attrib.maskbit = 1;
|
||||
- entry->msi_attrib.masked = 1;
|
||||
+ entry->msi_attrib.host_masked = 1;
|
||||
+ entry->msi_attrib.guest_masked = 1;
|
||||
entry->msi_attrib.pos = pos;
|
||||
entry->irq = msi->irq;
|
||||
entry->dev = dev;
|
||||
@@ -1152,7 +1166,8 @@ int pci_restore_msi_state(struct pci_dev
|
||||
|
||||
for ( i = 0; ; )
|
||||
{
|
||||
- msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
|
||||
+ msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked,
|
||||
+ entry[i].msi_attrib.guest_masked);
|
||||
|
||||
if ( !--nr )
|
||||
break;
|
||||
@@ -1304,7 +1319,7 @@ static void dump_msi(unsigned char key)
|
||||
else
|
||||
mask = '?';
|
||||
printk(" %-6s%4u vec=%02x%7s%6s%3sassert%5s%7s"
|
||||
- " dest=%08x mask=%d/%d/%c\n",
|
||||
+ " dest=%08x mask=%d/%c%c/%c\n",
|
||||
type, irq,
|
||||
(data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT,
|
||||
data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
|
||||
@@ -1312,7 +1327,10 @@ static void dump_msi(unsigned char key)
|
||||
data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
|
||||
addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
|
||||
addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
|
||||
- dest32, attr.maskbit, attr.masked, mask);
|
||||
+ dest32, attr.maskbit,
|
||||
+ attr.host_masked ? 'H' : ' ',
|
||||
+ attr.guest_masked ? 'G' : ' ',
|
||||
+ mask);
|
||||
}
|
||||
}
|
||||
|
||||
--- a/xen/common/event_channel.c
|
||||
+++ b/xen/common/event_channel.c
|
||||
@@ -445,10 +445,7 @@ static long evtchn_bind_pirq(evtchn_bind
|
||||
|
||||
bind->port = port;
|
||||
|
||||
-#ifdef CONFIG_X86
|
||||
- if ( is_hvm_domain(d) && domain_pirq_to_irq(d, pirq) > 0 )
|
||||
- map_domain_emuirq_pirq(d, pirq, IRQ_PT);
|
||||
-#endif
|
||||
+ arch_evtchn_bind_pirq(d, pirq);
|
||||
|
||||
out:
|
||||
spin_unlock(&d->event_lock);
|
||||
--- a/xen/drivers/passthrough/amd/iommu_init.c
|
||||
+++ b/xen/drivers/passthrough/amd/iommu_init.c
|
||||
@@ -451,7 +451,7 @@ static void iommu_msi_unmask(struct irq_
|
||||
spin_lock_irqsave(&iommu->lock, flags);
|
||||
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
|
||||
spin_unlock_irqrestore(&iommu->lock, flags);
|
||||
- iommu->msi.msi_attrib.masked = 0;
|
||||
+ iommu->msi.msi_attrib.host_masked = 0;
|
||||
}
|
||||
|
||||
static void iommu_msi_mask(struct irq_desc *desc)
|
||||
@@ -464,7 +464,7 @@ static void iommu_msi_mask(struct irq_de
|
||||
spin_lock_irqsave(&iommu->lock, flags);
|
||||
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
|
||||
spin_unlock_irqrestore(&iommu->lock, flags);
|
||||
- iommu->msi.msi_attrib.masked = 1;
|
||||
+ iommu->msi.msi_attrib.host_masked = 1;
|
||||
}
|
||||
|
||||
static unsigned int iommu_msi_startup(struct irq_desc *desc)
|
||||
--- a/xen/drivers/passthrough/vtd/iommu.c
|
||||
+++ b/xen/drivers/passthrough/vtd/iommu.c
|
||||
@@ -996,7 +996,7 @@ static void dma_msi_unmask(struct irq_de
|
||||
spin_lock_irqsave(&iommu->register_lock, flags);
|
||||
dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
|
||||
spin_unlock_irqrestore(&iommu->register_lock, flags);
|
||||
- iommu->msi.msi_attrib.masked = 0;
|
||||
+ iommu->msi.msi_attrib.host_masked = 0;
|
||||
}
|
||||
|
||||
static void dma_msi_mask(struct irq_desc *desc)
|
||||
@@ -1008,7 +1008,7 @@ static void dma_msi_mask(struct irq_desc
|
||||
spin_lock_irqsave(&iommu->register_lock, flags);
|
||||
dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
|
||||
spin_unlock_irqrestore(&iommu->register_lock, flags);
|
||||
- iommu->msi.msi_attrib.masked = 1;
|
||||
+ iommu->msi.msi_attrib.host_masked = 1;
|
||||
}
|
||||
|
||||
static unsigned int dma_msi_startup(struct irq_desc *desc)
|
||||
--- a/xen/include/asm-arm/irq.h
|
||||
+++ b/xen/include/asm-arm/irq.h
|
||||
@@ -44,6 +44,8 @@ int route_irq_to_guest(struct domain *d,
|
||||
const char *devname);
|
||||
void arch_move_irqs(struct vcpu *v);
|
||||
|
||||
+#define arch_evtchn_bind_pirq(d, pirq) ((void)((d) + (pirq)))
|
||||
+
|
||||
/* Set IRQ type for an SPI */
|
||||
int irq_set_spi_type(unsigned int spi, unsigned int type);
|
||||
|
||||
--- a/xen/include/asm-x86/msi.h
|
||||
+++ b/xen/include/asm-x86/msi.h
|
||||
@@ -90,12 +90,13 @@ extern unsigned int pci_msix_get_table_l
|
||||
|
||||
struct msi_desc {
|
||||
struct msi_attrib {
|
||||
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
|
||||
- __u8 maskbit : 1; /* mask-pending bit supported ? */
|
||||
- __u8 masked : 1;
|
||||
+ __u8 type; /* {0: unused, 5h:MSI, 11h:MSI-X} */
|
||||
+ __u8 pos; /* Location of the MSI capability */
|
||||
+ __u8 maskbit : 1; /* mask/pending bit supported ? */
|
||||
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
|
||||
- __u8 pos; /* Location of the msi capability */
|
||||
- __u16 entry_nr; /* specific enabled entry */
|
||||
+ __u8 host_masked : 1;
|
||||
+ __u8 guest_masked : 1;
|
||||
+ __u16 entry_nr; /* specific enabled entry */
|
||||
} msi_attrib;
|
||||
|
||||
struct list_head list;
|
||||
@@ -236,6 +237,7 @@ void msi_compose_msg(unsigned vector, co
|
||||
void __msi_set_enable(u16 seg, u8 bus, u8 slot, u8 func, int pos, int enable);
|
||||
void mask_msi_irq(struct irq_desc *);
|
||||
void unmask_msi_irq(struct irq_desc *);
|
||||
+void guest_mask_msi_irq(struct irq_desc *, bool_t mask);
|
||||
void ack_nonmaskable_msi_irq(struct irq_desc *);
|
||||
void end_nonmaskable_msi_irq(struct irq_desc *, u8 vector);
|
||||
void set_msi_affinity(struct irq_desc *, const cpumask_t *);
|
||||
--- a/xen/include/xen/irq.h
|
||||
+++ b/xen/include/xen/irq.h
|
||||
@@ -172,4 +172,8 @@ unsigned int set_desc_affinity(struct ir
|
||||
unsigned int arch_hwdom_irqs(domid_t);
|
||||
#endif
|
||||
|
||||
+#ifndef arch_evtchn_bind_pirq
|
||||
+void arch_evtchn_bind_pirq(struct domain *, int pirq);
|
||||
+#endif
|
||||
+
|
||||
#endif /* __XEN_IRQ_H__ */
|
@ -1,284 +0,0 @@
|
||||
# Commit dff515dfeac4c1c13422a128c558ac21ddc6c8db
|
||||
# Date 2015-06-19 11:01:24 +0200
|
||||
# Author Malcolm Crossley <malcolm.crossley@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
gnttab: use per-VCPU maptrack free lists
|
||||
|
||||
Performance analysis of aggregate network throughput with many VMs
|
||||
shows that performance is significantly limited by contention on the
|
||||
maptrack lock when obtaining/releasing maptrack handles from the free
|
||||
list.
|
||||
|
||||
Instead of a single free list use a per-VCPU list. This avoids any
|
||||
contention when obtaining a handle. Handles must be released back to
|
||||
their original list and since this may occur on a different VCPU there
|
||||
is some contention on the destination VCPU's free list tail pointer
|
||||
(but this is much better than a per-domain lock).
|
||||
|
||||
Increase the default maximum number of maptrack frames by 4 times
|
||||
because: a) struct grant_mapping is now 16 bytes (instead of 8); and
|
||||
b) a guest may not evenly distribute all the grant map operations
|
||||
across the VCPUs (meaning some VCPUs need more maptrack entries than
|
||||
others).
|
||||
|
||||
Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/domain.c
|
||||
+++ b/xen/common/domain.c
|
||||
@@ -126,6 +126,8 @@ struct vcpu *alloc_vcpu(
|
||||
|
||||
tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
|
||||
|
||||
+ grant_table_init_vcpu(v);
|
||||
+
|
||||
if ( !zalloc_cpumask_var(&v->cpu_hard_affinity) ||
|
||||
!zalloc_cpumask_var(&v->cpu_hard_affinity_tmp) ||
|
||||
!zalloc_cpumask_var(&v->cpu_hard_affinity_saved) ||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -37,6 +37,7 @@
|
||||
#include <xen/iommu.h>
|
||||
#include <xen/paging.h>
|
||||
#include <xen/keyhandler.h>
|
||||
+#include <xen/vmap.h>
|
||||
#include <xsm/xsm.h>
|
||||
#include <asm/flushtlb.h>
|
||||
|
||||
@@ -57,7 +58,7 @@ integer_param("gnttab_max_frames", max_g
|
||||
* New options allow to set max_maptrack_frames and
|
||||
* map_grant_table_frames independently.
|
||||
*/
|
||||
-#define DEFAULT_MAX_MAPTRACK_FRAMES 256
|
||||
+#define DEFAULT_MAX_MAPTRACK_FRAMES 1024
|
||||
|
||||
static unsigned int __read_mostly max_maptrack_frames;
|
||||
integer_param("gnttab_max_maptrack_frames", max_maptrack_frames);
|
||||
@@ -279,62 +280,103 @@ double_gt_unlock(struct grant_table *lgt
|
||||
|
||||
static inline int
|
||||
__get_maptrack_handle(
|
||||
- struct grant_table *t)
|
||||
+ struct grant_table *t,
|
||||
+ struct vcpu *v)
|
||||
{
|
||||
- unsigned int h;
|
||||
- if ( unlikely((h = t->maptrack_head) == MAPTRACK_TAIL) )
|
||||
+ unsigned int head, next;
|
||||
+
|
||||
+ /* No maptrack pages allocated for this VCPU yet? */
|
||||
+ head = v->maptrack_head;
|
||||
+ if ( unlikely(head == MAPTRACK_TAIL) )
|
||||
return -1;
|
||||
- t->maptrack_head = maptrack_entry(t, h).ref;
|
||||
- return h;
|
||||
+
|
||||
+ /*
|
||||
+ * Always keep one entry in the free list to make it easier to add
|
||||
+ * free entries to the tail.
|
||||
+ */
|
||||
+ next = read_atomic(&maptrack_entry(t, head).ref);
|
||||
+ if ( unlikely(next == MAPTRACK_TAIL) )
|
||||
+ return -1;
|
||||
+
|
||||
+ v->maptrack_head = next;
|
||||
+
|
||||
+ return head;
|
||||
}
|
||||
|
||||
static inline void
|
||||
put_maptrack_handle(
|
||||
struct grant_table *t, int handle)
|
||||
{
|
||||
- spin_lock(&t->maptrack_lock);
|
||||
- maptrack_entry(t, handle).ref = t->maptrack_head;
|
||||
- t->maptrack_head = handle;
|
||||
- spin_unlock(&t->maptrack_lock);
|
||||
+ struct domain *currd = current->domain;
|
||||
+ struct vcpu *v;
|
||||
+ unsigned int prev_tail, cur_tail;
|
||||
+
|
||||
+ /* 1. Set entry to be a tail. */
|
||||
+ maptrack_entry(t, handle).ref = MAPTRACK_TAIL;
|
||||
+
|
||||
+ /* 2. Add entry to the tail of the list on the original VCPU. */
|
||||
+ v = currd->vcpu[maptrack_entry(t, handle).vcpu];
|
||||
+
|
||||
+ cur_tail = read_atomic(&v->maptrack_tail);
|
||||
+ do {
|
||||
+ prev_tail = cur_tail;
|
||||
+ cur_tail = cmpxchg(&v->maptrack_tail, prev_tail, handle);
|
||||
+ } while ( cur_tail != prev_tail );
|
||||
+
|
||||
+ /* 3. Update the old tail entry to point to the new entry. */
|
||||
+ write_atomic(&maptrack_entry(t, prev_tail).ref, handle);
|
||||
}
|
||||
|
||||
static inline int
|
||||
get_maptrack_handle(
|
||||
struct grant_table *lgt)
|
||||
{
|
||||
+ struct vcpu *curr = current;
|
||||
int i;
|
||||
grant_handle_t handle;
|
||||
struct grant_mapping *new_mt;
|
||||
- unsigned int new_mt_limit, nr_frames;
|
||||
+
|
||||
+ handle = __get_maptrack_handle(lgt, curr);
|
||||
+ if ( likely(handle != -1) )
|
||||
+ return handle;
|
||||
|
||||
spin_lock(&lgt->maptrack_lock);
|
||||
|
||||
- while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
|
||||
+ if ( nr_maptrack_frames(lgt) >= max_maptrack_frames )
|
||||
{
|
||||
- nr_frames = nr_maptrack_frames(lgt);
|
||||
- if ( nr_frames >= max_maptrack_frames )
|
||||
- break;
|
||||
+ spin_unlock(&lgt->maptrack_lock);
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
- new_mt = alloc_xenheap_page();
|
||||
- if ( !new_mt )
|
||||
- break;
|
||||
+ new_mt = alloc_xenheap_page();
|
||||
+ if ( !new_mt )
|
||||
+ {
|
||||
+ spin_unlock(&lgt->maptrack_lock);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ clear_page(new_mt);
|
||||
|
||||
- clear_page(new_mt);
|
||||
+ /*
|
||||
+ * Use the first new entry and add the remaining entries to the
|
||||
+ * head of the free list.
|
||||
+ */
|
||||
+ handle = lgt->maptrack_limit;
|
||||
|
||||
- new_mt_limit = lgt->maptrack_limit + MAPTRACK_PER_PAGE;
|
||||
+ for ( i = 0; i < MAPTRACK_PER_PAGE; i++ )
|
||||
+ {
|
||||
+ new_mt[i].ref = handle + i + 1;
|
||||
+ new_mt[i].vcpu = curr->vcpu_id;
|
||||
+ }
|
||||
+ new_mt[i - 1].ref = curr->maptrack_head;
|
||||
|
||||
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
|
||||
- new_mt[i - 1].ref = lgt->maptrack_limit + i;
|
||||
- new_mt[i - 1].ref = lgt->maptrack_head;
|
||||
- lgt->maptrack_head = lgt->maptrack_limit;
|
||||
+ /* Set tail directly if this is the first page for this VCPU. */
|
||||
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
|
||||
+ curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
|
||||
|
||||
- lgt->maptrack[nr_frames] = new_mt;
|
||||
- smp_wmb();
|
||||
- lgt->maptrack_limit = new_mt_limit;
|
||||
+ curr->maptrack_head = handle + 1;
|
||||
|
||||
- gdprintk(XENLOG_INFO, "Increased maptrack size to %u frames\n",
|
||||
- nr_frames + 1);
|
||||
- }
|
||||
+ lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
|
||||
+ lgt->maptrack_limit += MAPTRACK_PER_PAGE;
|
||||
|
||||
spin_unlock(&lgt->maptrack_lock);
|
||||
|
||||
@@ -3061,16 +3103,9 @@ grant_table_create(
|
||||
}
|
||||
|
||||
/* Tracking of mapped foreign frames table */
|
||||
- if ( (t->maptrack = xzalloc_array(struct grant_mapping *,
|
||||
- max_maptrack_frames)) == NULL )
|
||||
+ t->maptrack = vzalloc(max_maptrack_frames * sizeof(*t->maptrack));
|
||||
+ if ( t->maptrack == NULL )
|
||||
goto no_mem_2;
|
||||
- if ( (t->maptrack[0] = alloc_xenheap_page()) == NULL )
|
||||
- goto no_mem_3;
|
||||
- clear_page(t->maptrack[0]);
|
||||
- t->maptrack_limit = MAPTRACK_PER_PAGE;
|
||||
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
|
||||
- t->maptrack[0][i - 1].ref = i;
|
||||
- t->maptrack[0][i - 1].ref = MAPTRACK_TAIL;
|
||||
|
||||
/* Shared grant table. */
|
||||
if ( (t->shared_raw = xzalloc_array(void *, max_grant_frames)) == NULL )
|
||||
@@ -3102,8 +3137,7 @@ grant_table_create(
|
||||
free_xenheap_page(t->shared_raw[i]);
|
||||
xfree(t->shared_raw);
|
||||
no_mem_3:
|
||||
- free_xenheap_page(t->maptrack[0]);
|
||||
- xfree(t->maptrack);
|
||||
+ vfree(t->maptrack);
|
||||
no_mem_2:
|
||||
for ( i = 0;
|
||||
i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ )
|
||||
@@ -3238,7 +3272,7 @@ grant_table_destroy(
|
||||
|
||||
for ( i = 0; i < nr_maptrack_frames(t); i++ )
|
||||
free_xenheap_page(t->maptrack[i]);
|
||||
- xfree(t->maptrack);
|
||||
+ vfree(t->maptrack);
|
||||
|
||||
for ( i = 0; i < nr_active_grant_frames(t); i++ )
|
||||
free_xenheap_page(t->active[i]);
|
||||
@@ -3252,6 +3286,12 @@ grant_table_destroy(
|
||||
d->grant_table = NULL;
|
||||
}
|
||||
|
||||
+void grant_table_init_vcpu(struct vcpu *v)
|
||||
+{
|
||||
+ v->maptrack_head = MAPTRACK_TAIL;
|
||||
+ v->maptrack_tail = MAPTRACK_TAIL;
|
||||
+}
|
||||
+
|
||||
static void gnttab_usage_print(struct domain *rd)
|
||||
{
|
||||
int first = 1;
|
||||
--- a/xen/include/xen/grant_table.h
|
||||
+++ b/xen/include/xen/grant_table.h
|
||||
@@ -60,6 +60,8 @@ struct grant_mapping {
|
||||
u32 ref; /* grant ref */
|
||||
u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */
|
||||
domid_t domid; /* granting domain */
|
||||
+ u32 vcpu; /* vcpu which created the grant mapping */
|
||||
+ u32 pad; /* round size to a power of 2 */
|
||||
};
|
||||
|
||||
/* Per-domain grant information. */
|
||||
@@ -83,9 +85,8 @@ struct grant_table {
|
||||
grant_status_t **status;
|
||||
/* Active grant table. */
|
||||
struct active_grant_entry **active;
|
||||
- /* Mapping tracking table. */
|
||||
+ /* Mapping tracking table per vcpu. */
|
||||
struct grant_mapping **maptrack;
|
||||
- unsigned int maptrack_head;
|
||||
unsigned int maptrack_limit;
|
||||
/* Lock protecting the maptrack page list, head, and limit */
|
||||
spinlock_t maptrack_lock;
|
||||
@@ -99,6 +100,7 @@ int grant_table_create(
|
||||
struct domain *d);
|
||||
void grant_table_destroy(
|
||||
struct domain *d);
|
||||
+void grant_table_init_vcpu(struct vcpu *v);
|
||||
|
||||
/* Domain death release of granted mappings of other domains' memory. */
|
||||
void
|
||||
--- a/xen/include/xen/sched.h
|
||||
+++ b/xen/include/xen/sched.h
|
||||
@@ -219,6 +219,10 @@ struct vcpu
|
||||
/* VCPU paused by system controller. */
|
||||
int controller_pause_count;
|
||||
|
||||
+ /* Maptrack */
|
||||
+ unsigned int maptrack_head;
|
||||
+ unsigned int maptrack_tail;
|
||||
+
|
||||
/* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
|
||||
evtchn_port_t virq_to_evtchn[NR_VIRQS];
|
||||
spinlock_t virq_lock;
|
@ -1,153 +0,0 @@
|
||||
# Commit e76ff6c156906b515c2a4300a81c95886ece5d5f
|
||||
# Date 2015-06-19 11:02:04 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
gnttab: steal maptrack entries from other VCPUs
|
||||
|
||||
If a guest is not evenly grant mapping across its VCPUs one of the
|
||||
VCPUs may run out of free maptrack entries even though other VCPUs
|
||||
have many free.
|
||||
|
||||
If this happens, "steal" free entries from other VCPUs. We want to
|
||||
steal entries such that:
|
||||
|
||||
a) We avoid ping-ponging stolen entries between VCPUs.
|
||||
|
||||
b) The number of free entries owned by each VCPUs tends (over time) to
|
||||
the number it uses.
|
||||
|
||||
So when stealing, we select a VCPU at random (reducing (a)) and we
|
||||
transfer the stolen entries to the thief VCPU (aiming for (b)).
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -283,26 +283,70 @@ __get_maptrack_handle(
|
||||
struct grant_table *t,
|
||||
struct vcpu *v)
|
||||
{
|
||||
- unsigned int head, next;
|
||||
+ unsigned int head, next, prev_head;
|
||||
|
||||
- /* No maptrack pages allocated for this VCPU yet? */
|
||||
- head = v->maptrack_head;
|
||||
- if ( unlikely(head == MAPTRACK_TAIL) )
|
||||
- return -1;
|
||||
-
|
||||
- /*
|
||||
- * Always keep one entry in the free list to make it easier to add
|
||||
- * free entries to the tail.
|
||||
- */
|
||||
- next = read_atomic(&maptrack_entry(t, head).ref);
|
||||
- if ( unlikely(next == MAPTRACK_TAIL) )
|
||||
- return -1;
|
||||
+ do {
|
||||
+ /* No maptrack pages allocated for this VCPU yet? */
|
||||
+ head = read_atomic(&v->maptrack_head);
|
||||
+ if ( unlikely(head == MAPTRACK_TAIL) )
|
||||
+ return -1;
|
||||
|
||||
- v->maptrack_head = next;
|
||||
+ /*
|
||||
+ * Always keep one entry in the free list to make it easier to
|
||||
+ * add free entries to the tail.
|
||||
+ */
|
||||
+ next = read_atomic(&maptrack_entry(t, head).ref);
|
||||
+ if ( unlikely(next == MAPTRACK_TAIL) )
|
||||
+ return -1;
|
||||
+
|
||||
+ prev_head = head;
|
||||
+ head = cmpxchg(&v->maptrack_head, prev_head, next);
|
||||
+ } while ( head != prev_head );
|
||||
|
||||
return head;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Try to "steal" a free maptrack entry from another VCPU.
|
||||
+ *
|
||||
+ * A stolen entry is transferred to the thief, so the number of
|
||||
+ * entries for each VCPU should tend to the usage pattern.
|
||||
+ *
|
||||
+ * To avoid having to atomically count the number of free entries on
|
||||
+ * each VCPU and to avoid two VCPU repeatedly stealing entries from
|
||||
+ * each other, the initial victim VCPU is selected randomly.
|
||||
+ */
|
||||
+static int steal_maptrack_handle(struct grant_table *t,
|
||||
+ const struct vcpu *curr)
|
||||
+{
|
||||
+ const struct domain *currd = curr->domain;
|
||||
+ unsigned int first, i;
|
||||
+
|
||||
+ /* Find an initial victim. */
|
||||
+ first = i = get_random() % currd->max_vcpus;
|
||||
+
|
||||
+ do {
|
||||
+ if ( currd->vcpu[i] )
|
||||
+ {
|
||||
+ int handle;
|
||||
+
|
||||
+ handle = __get_maptrack_handle(t, currd->vcpu[i]);
|
||||
+ if ( handle != -1 )
|
||||
+ {
|
||||
+ maptrack_entry(t, handle).vcpu = curr->vcpu_id;
|
||||
+ return handle;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ i++;
|
||||
+ if ( i == currd->max_vcpus )
|
||||
+ i = 0;
|
||||
+ } while ( i != first );
|
||||
+
|
||||
+ /* No free handles on any VCPU. */
|
||||
+ return -1;
|
||||
+}
|
||||
+
|
||||
static inline void
|
||||
put_maptrack_handle(
|
||||
struct grant_table *t, int handle)
|
||||
@@ -342,10 +386,31 @@ get_maptrack_handle(
|
||||
|
||||
spin_lock(&lgt->maptrack_lock);
|
||||
|
||||
+ /*
|
||||
+ * If we've run out of frames, try stealing an entry from another
|
||||
+ * VCPU (in case the guest isn't mapping across its VCPUs evenly).
|
||||
+ */
|
||||
if ( nr_maptrack_frames(lgt) >= max_maptrack_frames )
|
||||
{
|
||||
+ /*
|
||||
+ * Can drop the lock since no other VCPU can be adding a new
|
||||
+ * frame once they've run out.
|
||||
+ */
|
||||
spin_unlock(&lgt->maptrack_lock);
|
||||
- return -1;
|
||||
+
|
||||
+ /*
|
||||
+ * Uninitialized free list? Steal an extra entry for the tail
|
||||
+ * sentinel.
|
||||
+ */
|
||||
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
|
||||
+ {
|
||||
+ handle = steal_maptrack_handle(lgt, curr);
|
||||
+ if ( handle == -1 )
|
||||
+ return -1;
|
||||
+ curr->maptrack_tail = handle;
|
||||
+ write_atomic(&curr->maptrack_head, handle);
|
||||
+ }
|
||||
+ return steal_maptrack_handle(lgt, curr);
|
||||
}
|
||||
|
||||
new_mt = alloc_xenheap_page();
|
||||
@@ -373,7 +438,7 @@ get_maptrack_handle(
|
||||
if ( curr->maptrack_tail == MAPTRACK_TAIL )
|
||||
curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
|
||||
|
||||
- curr->maptrack_head = handle + 1;
|
||||
+ write_atomic(&curr->maptrack_head, handle + 1);
|
||||
|
||||
lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
|
||||
lgt->maptrack_limit += MAPTRACK_PER_PAGE;
|
@ -1,105 +0,0 @@
|
||||
# Commit b399386bcdb9d458f5647476a06fe86f5968d87e
|
||||
# Date 2015-06-22 11:36:17 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
evtchn: clear xen_consumer when clearing state
|
||||
|
||||
Freeing a xen event channel would clear xen_consumer before clearing
|
||||
the channel state, leaving a window where the channel is in a funny
|
||||
state (still bound but no consumer).
|
||||
|
||||
Move the clear of xen_consumer into free_evtchn() where the state is
|
||||
also cleared.
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
|
||||
Ditch the pointless evtchn_close() wrapper around __evtchn_close()
|
||||
(renaming the latter) as well as some bogus casts of function results
|
||||
to void.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/event_channel.c
|
||||
+++ b/xen/common/event_channel.c
|
||||
@@ -204,6 +204,7 @@ static void free_evtchn(struct domain *d
|
||||
/* Reset binding to vcpu0 when the channel is freed. */
|
||||
chn->state = ECS_FREE;
|
||||
chn->notify_vcpu_id = 0;
|
||||
+ chn->xen_consumer = 0;
|
||||
|
||||
xsm_evtchn_close_post(chn);
|
||||
}
|
||||
@@ -467,7 +468,7 @@ static long evtchn_bind_pirq(evtchn_bind
|
||||
}
|
||||
|
||||
|
||||
-static long __evtchn_close(struct domain *d1, int port1)
|
||||
+static long evtchn_close(struct domain *d1, int port1, bool_t guest)
|
||||
{
|
||||
struct domain *d2 = NULL;
|
||||
struct vcpu *v;
|
||||
@@ -487,7 +488,7 @@ static long __evtchn_close(struct domain
|
||||
chn1 = evtchn_from_port(d1, port1);
|
||||
|
||||
/* Guest cannot close a Xen-attached event channel. */
|
||||
- if ( unlikely(consumer_is_xen(chn1)) )
|
||||
+ if ( unlikely(consumer_is_xen(chn1)) && guest )
|
||||
{
|
||||
rc = -EINVAL;
|
||||
goto out;
|
||||
@@ -596,12 +597,6 @@ static long __evtchn_close(struct domain
|
||||
return rc;
|
||||
}
|
||||
|
||||
-
|
||||
-static long evtchn_close(evtchn_close_t *close)
|
||||
-{
|
||||
- return __evtchn_close(current->domain, close->port);
|
||||
-}
|
||||
-
|
||||
int evtchn_send(struct domain *d, unsigned int lport)
|
||||
{
|
||||
struct evtchn *lchn, *rchn;
|
||||
@@ -956,7 +951,7 @@ static long evtchn_reset(evtchn_reset_t
|
||||
goto out;
|
||||
|
||||
for ( i = 0; port_is_valid(d, i); i++ )
|
||||
- (void)__evtchn_close(d, i);
|
||||
+ evtchn_close(d, i, 1);
|
||||
|
||||
spin_lock(&d->event_lock);
|
||||
|
||||
@@ -1063,7 +1058,7 @@ long do_event_channel_op(int cmd, XEN_GU
|
||||
struct evtchn_close close;
|
||||
if ( copy_from_guest(&close, arg, 1) != 0 )
|
||||
return -EFAULT;
|
||||
- rc = evtchn_close(&close);
|
||||
+ rc = evtchn_close(current->domain, close.port, 1);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1193,11 +1188,10 @@ void free_xen_event_channel(
|
||||
BUG_ON(!port_is_valid(d, port));
|
||||
chn = evtchn_from_port(d, port);
|
||||
BUG_ON(!consumer_is_xen(chn));
|
||||
- chn->xen_consumer = 0;
|
||||
|
||||
spin_unlock(&d->event_lock);
|
||||
|
||||
- (void)__evtchn_close(d, port);
|
||||
+ evtchn_close(d, port, 0);
|
||||
}
|
||||
|
||||
|
||||
@@ -1296,10 +1290,7 @@ void evtchn_destroy(struct domain *d)
|
||||
|
||||
/* Close all existing event channels. */
|
||||
for ( i = 0; port_is_valid(d, i); i++ )
|
||||
- {
|
||||
- evtchn_from_port(d, i)->xen_consumer = 0;
|
||||
- (void)__evtchn_close(d, i);
|
||||
- }
|
||||
+ evtchn_close(d, i, 0);
|
||||
|
||||
/* Free all event-channel buckets. */
|
||||
spin_lock(&d->event_lock);
|
@ -1,110 +0,0 @@
|
||||
# Commit a753f0e53ff973a8a066e86c1cb3d6dd5c68d59f
|
||||
# Date 2015-06-22 11:38:01 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
evtchn: defer freeing struct evtchn's until evtchn_destroy_final()
|
||||
|
||||
notify_via_xen_event_channel() and free_xen_event_channel() had to
|
||||
check if the domain was dying because they may be called while the
|
||||
domain is being destroyed and the struct evtchn's are being freed.
|
||||
|
||||
By deferring the freeing of the struct evtchn's until all references
|
||||
to the domain are dropped, these functions can rely on the channel
|
||||
state being present and valid.
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
|
||||
--- a/xen/common/event_channel.c
|
||||
+++ b/xen/common/event_channel.c
|
||||
@@ -1174,22 +1174,8 @@ int alloc_unbound_xen_event_channel(
|
||||
void free_xen_event_channel(
|
||||
struct vcpu *local_vcpu, int port)
|
||||
{
|
||||
- struct evtchn *chn;
|
||||
struct domain *d = local_vcpu->domain;
|
||||
-
|
||||
- spin_lock(&d->event_lock);
|
||||
-
|
||||
- if ( unlikely(d->is_dying) )
|
||||
- {
|
||||
- spin_unlock(&d->event_lock);
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
BUG_ON(!port_is_valid(d, port));
|
||||
- chn = evtchn_from_port(d, port);
|
||||
- BUG_ON(!consumer_is_xen(chn));
|
||||
-
|
||||
- spin_unlock(&d->event_lock);
|
||||
|
||||
evtchn_close(d, port, 0);
|
||||
}
|
||||
@@ -1203,18 +1189,12 @@ void notify_via_xen_event_channel(struct
|
||||
|
||||
spin_lock(&ld->event_lock);
|
||||
|
||||
- if ( unlikely(ld->is_dying) )
|
||||
- {
|
||||
- spin_unlock(&ld->event_lock);
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
ASSERT(port_is_valid(ld, lport));
|
||||
lchn = evtchn_from_port(ld, lport);
|
||||
- ASSERT(consumer_is_xen(lchn));
|
||||
|
||||
if ( likely(lchn->state == ECS_INTERDOMAIN) )
|
||||
{
|
||||
+ ASSERT(consumer_is_xen(lchn));
|
||||
rd = lchn->u.interdomain.remote_dom;
|
||||
rport = lchn->u.interdomain.remote_port;
|
||||
rchn = evtchn_from_port(rd, rport);
|
||||
@@ -1282,7 +1262,7 @@ int evtchn_init(struct domain *d)
|
||||
|
||||
void evtchn_destroy(struct domain *d)
|
||||
{
|
||||
- unsigned int i, j;
|
||||
+ unsigned int i;
|
||||
|
||||
/* After this barrier no new event-channel allocations can occur. */
|
||||
BUG_ON(!d->is_dying);
|
||||
@@ -1292,8 +1272,17 @@ void evtchn_destroy(struct domain *d)
|
||||
for ( i = 0; port_is_valid(d, i); i++ )
|
||||
evtchn_close(d, i, 0);
|
||||
|
||||
+ clear_global_virq_handlers(d);
|
||||
+
|
||||
+ evtchn_fifo_destroy(d);
|
||||
+}
|
||||
+
|
||||
+
|
||||
+void evtchn_destroy_final(struct domain *d)
|
||||
+{
|
||||
+ unsigned int i, j;
|
||||
+
|
||||
/* Free all event-channel buckets. */
|
||||
- spin_lock(&d->event_lock);
|
||||
for ( i = 0; i < NR_EVTCHN_GROUPS; i++ )
|
||||
{
|
||||
if ( !d->evtchn_group[i] )
|
||||
@@ -1301,20 +1290,9 @@ void evtchn_destroy(struct domain *d)
|
||||
for ( j = 0; j < BUCKETS_PER_GROUP; j++ )
|
||||
free_evtchn_bucket(d, d->evtchn_group[i][j]);
|
||||
xfree(d->evtchn_group[i]);
|
||||
- d->evtchn_group[i] = NULL;
|
||||
}
|
||||
free_evtchn_bucket(d, d->evtchn);
|
||||
- d->evtchn = NULL;
|
||||
- spin_unlock(&d->event_lock);
|
||||
|
||||
- clear_global_virq_handlers(d);
|
||||
-
|
||||
- evtchn_fifo_destroy(d);
|
||||
-}
|
||||
-
|
||||
-
|
||||
-void evtchn_destroy_final(struct domain *d)
|
||||
-{
|
||||
#if MAX_VIRT_CPUS > BITS_PER_LONG
|
||||
xfree(d->poll_mask);
|
||||
d->poll_mask = NULL;
|
@ -1,257 +0,0 @@
|
||||
# Commit de6acb78bf0e137cbe5b72cee4a35ca018d759cc
|
||||
# Date 2015-06-22 11:39:03 +0200
|
||||
# Author David Vrabel <david.vrabel@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
evtchn: use a per-event channel lock for sending events
|
||||
|
||||
When sending an event, use a new per-event channel lock to safely
|
||||
validate the event channel state.
|
||||
|
||||
This new lock must be held when changing event channel state. Note
|
||||
that the event channel lock must also be held when changing state from
|
||||
ECS_FREE or it will race with a concurrent get_free_port() call.
|
||||
|
||||
To avoid having to take the remote event channel locks when sending to
|
||||
an interdomain event channel, the local and remote channel locks are
|
||||
both held when binding or closing an interdomain event channel.
|
||||
|
||||
This significantly increases the number of events that can be sent
|
||||
from multiple VCPUs. But struct evtchn increases in size, reducing
|
||||
the number that fit into a single page to 64 (instead of 128).
|
||||
|
||||
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/event_channel.c
|
||||
+++ b/xen/common/event_channel.c
|
||||
@@ -141,6 +141,7 @@ static struct evtchn *alloc_evtchn_bucke
|
||||
return NULL;
|
||||
}
|
||||
chn[i].port = port + i;
|
||||
+ spin_lock_init(&chn[i].lock);
|
||||
}
|
||||
return chn;
|
||||
}
|
||||
@@ -231,11 +232,15 @@ static long evtchn_alloc_unbound(evtchn_
|
||||
if ( rc )
|
||||
goto out;
|
||||
|
||||
+ spin_lock(&chn->lock);
|
||||
+
|
||||
chn->state = ECS_UNBOUND;
|
||||
if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF )
|
||||
chn->u.unbound.remote_domid = current->domain->domain_id;
|
||||
evtchn_port_init(d, chn);
|
||||
|
||||
+ spin_unlock(&chn->lock);
|
||||
+
|
||||
alloc->port = port;
|
||||
|
||||
out:
|
||||
@@ -246,6 +251,28 @@ static long evtchn_alloc_unbound(evtchn_
|
||||
}
|
||||
|
||||
|
||||
+static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
|
||||
+{
|
||||
+ if ( lchn < rchn )
|
||||
+ {
|
||||
+ spin_lock(&lchn->lock);
|
||||
+ spin_lock(&rchn->lock);
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ if ( lchn != rchn )
|
||||
+ spin_lock(&rchn->lock);
|
||||
+ spin_lock(&lchn->lock);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn)
|
||||
+{
|
||||
+ spin_unlock(&lchn->lock);
|
||||
+ if ( lchn != rchn )
|
||||
+ spin_unlock(&rchn->lock);
|
||||
+}
|
||||
+
|
||||
static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
|
||||
{
|
||||
struct evtchn *lchn, *rchn;
|
||||
@@ -288,6 +315,8 @@ static long evtchn_bind_interdomain(evtc
|
||||
if ( rc )
|
||||
goto out;
|
||||
|
||||
+ double_evtchn_lock(lchn, rchn);
|
||||
+
|
||||
lchn->u.interdomain.remote_dom = rd;
|
||||
lchn->u.interdomain.remote_port = rport;
|
||||
lchn->state = ECS_INTERDOMAIN;
|
||||
@@ -303,6 +332,8 @@ static long evtchn_bind_interdomain(evtc
|
||||
*/
|
||||
evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport);
|
||||
|
||||
+ double_evtchn_unlock(lchn, rchn);
|
||||
+
|
||||
bind->local_port = lport;
|
||||
|
||||
out:
|
||||
@@ -343,11 +374,16 @@ static long evtchn_bind_virq(evtchn_bind
|
||||
ERROR_EXIT(port);
|
||||
|
||||
chn = evtchn_from_port(d, port);
|
||||
+
|
||||
+ spin_lock(&chn->lock);
|
||||
+
|
||||
chn->state = ECS_VIRQ;
|
||||
chn->notify_vcpu_id = vcpu;
|
||||
chn->u.virq = virq;
|
||||
evtchn_port_init(d, chn);
|
||||
|
||||
+ spin_unlock(&chn->lock);
|
||||
+
|
||||
v->virq_to_evtchn[virq] = bind->port = port;
|
||||
|
||||
out:
|
||||
@@ -374,10 +410,15 @@ static long evtchn_bind_ipi(evtchn_bind_
|
||||
ERROR_EXIT(port);
|
||||
|
||||
chn = evtchn_from_port(d, port);
|
||||
+
|
||||
+ spin_lock(&chn->lock);
|
||||
+
|
||||
chn->state = ECS_IPI;
|
||||
chn->notify_vcpu_id = vcpu;
|
||||
evtchn_port_init(d, chn);
|
||||
|
||||
+ spin_unlock(&chn->lock);
|
||||
+
|
||||
bind->port = port;
|
||||
|
||||
out:
|
||||
@@ -452,11 +493,15 @@ static long evtchn_bind_pirq(evtchn_bind
|
||||
goto out;
|
||||
}
|
||||
|
||||
+ spin_lock(&chn->lock);
|
||||
+
|
||||
chn->state = ECS_PIRQ;
|
||||
chn->u.pirq.irq = pirq;
|
||||
link_pirq_port(port, chn, v);
|
||||
evtchn_port_init(d, chn);
|
||||
|
||||
+ spin_unlock(&chn->lock);
|
||||
+
|
||||
bind->port = port;
|
||||
|
||||
arch_evtchn_bind_pirq(d, pirq);
|
||||
@@ -574,15 +619,24 @@ static long evtchn_close(struct domain *
|
||||
BUG_ON(chn2->state != ECS_INTERDOMAIN);
|
||||
BUG_ON(chn2->u.interdomain.remote_dom != d1);
|
||||
|
||||
+ double_evtchn_lock(chn1, chn2);
|
||||
+
|
||||
+ free_evtchn(d1, chn1);
|
||||
+
|
||||
chn2->state = ECS_UNBOUND;
|
||||
chn2->u.unbound.remote_domid = d1->domain_id;
|
||||
- break;
|
||||
+
|
||||
+ double_evtchn_unlock(chn1, chn2);
|
||||
+
|
||||
+ goto out;
|
||||
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
+ spin_lock(&chn1->lock);
|
||||
free_evtchn(d1, chn1);
|
||||
+ spin_unlock(&chn1->lock);
|
||||
|
||||
out:
|
||||
if ( d2 != NULL )
|
||||
@@ -604,21 +658,18 @@ int evtchn_send(struct domain *d, unsign
|
||||
struct vcpu *rvcpu;
|
||||
int rport, ret = 0;
|
||||
|
||||
- spin_lock(&ld->event_lock);
|
||||
-
|
||||
- if ( unlikely(!port_is_valid(ld, lport)) )
|
||||
- {
|
||||
- spin_unlock(&ld->event_lock);
|
||||
+ if ( !port_is_valid(ld, lport) )
|
||||
return -EINVAL;
|
||||
- }
|
||||
|
||||
lchn = evtchn_from_port(ld, lport);
|
||||
|
||||
+ spin_lock(&lchn->lock);
|
||||
+
|
||||
/* Guest cannot send via a Xen-attached event channel. */
|
||||
if ( unlikely(consumer_is_xen(lchn)) )
|
||||
{
|
||||
- spin_unlock(&ld->event_lock);
|
||||
- return -EINVAL;
|
||||
+ ret = -EINVAL;
|
||||
+ goto out;
|
||||
}
|
||||
|
||||
ret = xsm_evtchn_send(XSM_HOOK, ld, lchn);
|
||||
@@ -648,7 +699,7 @@ int evtchn_send(struct domain *d, unsign
|
||||
}
|
||||
|
||||
out:
|
||||
- spin_unlock(&ld->event_lock);
|
||||
+ spin_unlock(&lchn->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -1159,11 +1210,15 @@ int alloc_unbound_xen_event_channel(
|
||||
if ( rc )
|
||||
goto out;
|
||||
|
||||
+ spin_lock(&chn->lock);
|
||||
+
|
||||
chn->state = ECS_UNBOUND;
|
||||
chn->xen_consumer = get_xen_consumer(notification_fn);
|
||||
chn->notify_vcpu_id = local_vcpu->vcpu_id;
|
||||
chn->u.unbound.remote_domid = remote_domid;
|
||||
|
||||
+ spin_unlock(&chn->lock);
|
||||
+
|
||||
out:
|
||||
spin_unlock(&d->event_lock);
|
||||
|
||||
@@ -1187,11 +1242,11 @@ void notify_via_xen_event_channel(struct
|
||||
struct domain *rd;
|
||||
int rport;
|
||||
|
||||
- spin_lock(&ld->event_lock);
|
||||
-
|
||||
ASSERT(port_is_valid(ld, lport));
|
||||
lchn = evtchn_from_port(ld, lport);
|
||||
|
||||
+ spin_lock(&lchn->lock);
|
||||
+
|
||||
if ( likely(lchn->state == ECS_INTERDOMAIN) )
|
||||
{
|
||||
ASSERT(consumer_is_xen(lchn));
|
||||
@@ -1201,7 +1256,7 @@ void notify_via_xen_event_channel(struct
|
||||
evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport);
|
||||
}
|
||||
|
||||
- spin_unlock(&ld->event_lock);
|
||||
+ spin_unlock(&lchn->lock);
|
||||
}
|
||||
|
||||
void evtchn_check_pollers(struct domain *d, unsigned int port)
|
||||
--- a/xen/include/xen/sched.h
|
||||
+++ b/xen/include/xen/sched.h
|
||||
@@ -79,6 +79,7 @@ extern domid_t hardware_domid;
|
||||
|
||||
struct evtchn
|
||||
{
|
||||
+ spinlock_t lock;
|
||||
#define ECS_FREE 0 /* Channel is available for use. */
|
||||
#define ECS_RESERVED 1 /* Channel is reserved. */
|
||||
#define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */
|
@ -1,27 +0,0 @@
# Commit b58214a24231a1f2a7e09ae9cc3014eff752918b
# Date 2015-06-22 11:39:46 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: pad struct evtchn to 64 bytes

The number of struct evtchn in a page must be a power of two. Under
some workloads performance is improved slightly by padding struct
evtchn to 64 bytes (a typical cache line size), thus putting fewer
per-channel locks into each cache line.

This does not decrease the number of struct evtchn's per-page.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -129,7 +129,7 @@ struct evtchn
#endif
} ssid;
#endif
-};
+} __attribute__((aligned(64)));

int evtchn_init(struct domain *d); /* from domain_create */
void evtchn_destroy(struct domain *d); /* from domain_kill */
@ -1,23 +0,0 @@
# Commit 142473cfce41a565898e0fa33dc98a1f5e41abe4
# Date 2015-06-25 14:57:04 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/traps: avoid using current too early on boot

Early on boot, current has the sentinel value 0xfffff000. Blindly using it in
show_registers() causes a nested failure, with no useful information printed
from an early crash.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -86,7 +86,7 @@ void show_registers(const struct cpu_use
struct cpu_user_regs fault_regs = *regs;
unsigned long fault_crs[8];
enum context context;
- struct vcpu *v = current;
+ struct vcpu *v = system_state >= SYS_STATE_smp_boot ? current : NULL;

if ( guest_mode(regs) && has_hvm_container_vcpu(v) )
{
@ -1,50 +0,0 @@
|
||||
# Commit 71bb7304e7a7a35ea6df4b0cedebc35028e4c159
|
||||
# Date 2015-06-30 15:00:54 +0100
|
||||
# Author Liang Li <liang.z.li@intel.com>
|
||||
# Committer Ian Campbell <ian.campbell@citrix.com>
|
||||
nested EPT: fix the handling of nested EPT
|
||||
|
||||
If the host EPT entry is changed, the nested EPT should be updated.
|
||||
The current code does not do this, which is wrong.
I have tested this patch; the L2 guest can boot and run as normal.
|
||||
|
||||
Signed-off-by: Liang Li <liang.z.li@intel.com>
|
||||
Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
Reported-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-ept.c
|
||||
+++ b/xen/arch/x86/mm/p2m-ept.c
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <asm/p2m.h>
|
||||
#include <asm/hvm/vmx/vmx.h>
|
||||
#include <asm/hvm/vmx/vmcs.h>
|
||||
+#include <asm/hvm/nestedhvm.h>
|
||||
#include <xen/iommu.h>
|
||||
#include <asm/mtrr.h>
|
||||
#include <asm/hvm/cacheattr.h>
|
||||
@@ -1040,6 +1041,9 @@ void ept_sync_domain(struct p2m_domain *
|
||||
|
||||
ASSERT(local_irq_is_enabled());
|
||||
|
||||
+ if ( nestedhvm_enabled(d) && !p2m_is_nestedp2m(p2m) )
|
||||
+ p2m_flush_nestedp2m(d);
|
||||
+
|
||||
/*
|
||||
* Flush active cpus synchronously. Flush others the next time this domain
|
||||
* is scheduled onto them. We accept the race of other CPUs adding to
|
||||
--- a/xen/arch/x86/mm/p2m.c
|
||||
+++ b/xen/arch/x86/mm/p2m.c
|
||||
@@ -1713,6 +1713,12 @@ p2m_flush_table(struct p2m_domain *p2m)
|
||||
ASSERT(page_list_empty(&p2m->pod.super));
|
||||
ASSERT(page_list_empty(&p2m->pod.single));
|
||||
|
||||
+ if ( p2m->np2m_base == P2M_BASE_EADDR )
|
||||
+ {
|
||||
+ p2m_unlock(p2m);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
/* This is no longer a valid nested p2m for any address space */
|
||||
p2m->np2m_base = P2M_BASE_EADDR;
|
||||
|
@ -1,64 +0,0 @@
|
||||
# Commit e4e9d2d4e76bd8fe229c124bd57fc6ba824271b3
|
||||
# Date 2015-07-07 11:37:26 +0200
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/p2m-ept: don't unmap the EPT pagetable while it is still in use
|
||||
|
||||
The call to iommu_pte_flush() between the two hunks uses &ept_entry->epte
|
||||
which is a pointer into the mapped page.
|
||||
|
||||
It is eventually passed to the `clflush` instruction, which will suffer a pagefault
|
||||
if the virtual mapping has fallen out of the TLB.
|
||||
|
||||
(XEN) ----[ Xen-4.5.0-xs102594-d x86_64 debug=y Not tainted ]----
|
||||
(XEN) CPU: 7
|
||||
(XEN) RIP: e008:[<ffff82d0801572f0>] cacheline_flush+0x4/0x9
|
||||
<snip>
|
||||
(XEN) Xen call trace:
|
||||
(XEN) [<ffff82d0801572f0>] cacheline_flush+0x4/0x9
|
||||
(XEN) [<ffff82d08014ffff>] __iommu_flush_cache+0x4a/0x6a
|
||||
(XEN) [<ffff82d0801532e2>] iommu_pte_flush+0x2b/0xd5
|
||||
(XEN) [<ffff82d0801f909a>] ept_set_entry+0x4bc/0x61f
|
||||
(XEN) [<ffff82d0801f0c25>] p2m_set_entry+0xd1/0x112
|
||||
(XEN) [<ffff82d0801f25b1>] clear_mmio_p2m_entry+0x1a0/0x200
|
||||
(XEN) [<ffff82d0801f4aac>] unmap_mmio_regions+0x49/0x73
|
||||
(XEN) [<ffff82d080106292>] do_domctl+0x15bd/0x1edb
|
||||
(XEN) [<ffff82d080234fcb>] syscall_enter+0xeb/0x145
|
||||
(XEN)
|
||||
(XEN) Pagetable walk from ffff820040004ae0:
|
||||
(XEN) L4[0x104] = 00000008668a5063 ffffffffffffffff
|
||||
(XEN) L3[0x001] = 00000008668a3063 ffffffffffffffff
|
||||
(XEN) L2[0x000] = 000000086689c063 ffffffffffffffff
|
||||
(XEN) L1[0x004] = 000000056f078063 000000000007f678
|
||||
(XEN)
|
||||
(XEN) ****************************************
|
||||
(XEN) Panic on CPU 7:
|
||||
(XEN) FATAL PAGE FAULT
|
||||
(XEN) [error_code=0000]
|
||||
(XEN) Faulting linear address: ffff820040004ae0
|
||||
(XEN) ****************************************
|
||||
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-ept.c
|
||||
+++ b/xen/arch/x86/mm/p2m-ept.c
|
||||
@@ -764,8 +764,6 @@ ept_set_entry(struct p2m_domain *p2m, un
|
||||
p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
|
||||
|
||||
out:
|
||||
- unmap_domain_page(table);
|
||||
-
|
||||
if ( needs_sync != sync_off )
|
||||
ept_sync_domain(p2m);
|
||||
|
||||
@@ -788,6 +786,8 @@ out:
|
||||
}
|
||||
}
|
||||
|
||||
+ unmap_domain_page(table);
|
||||
+
|
||||
/* Release the old intermediate tables, if any. This has to be the
|
||||
last thing we do, after the ept_sync_domain() and removal
|
||||
from the iommu tables, so as to avoid a potential
|
@ -1,88 +0,0 @@
|
||||
# Commit 8022b05284dea80e24813d03180788ec7277a0bd
|
||||
# Date 2015-07-07 14:29:39 +0200
|
||||
# Author Dario Faggioli <dario.faggioli@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86 / cpupool: clear the proper cpu_valid bit on pCPU teardown
|
||||
|
||||
In fact, when a pCPU goes down, we want to clear its
|
||||
bit in the correct cpupool's valid mask, rather than
|
||||
always in cpupool0's one.
|
||||
|
||||
Before this commit, all the pCPUs in the non-default
|
||||
pool(s) will be considered immediately valid, during
|
||||
system resume, even the ones that have not been brought
|
||||
up yet. As a result, the (Credit1) scheduler will attempt
|
||||
to run its load balancing logic on them, causing the
|
||||
following Oops:
|
||||
|
||||
# xl cpupool-cpu-remove Pool-0 8-15
|
||||
# xl cpupool-create name=\"Pool-1\"
|
||||
# xl cpupool-cpu-add Pool-1 8-15
|
||||
--> suspend
|
||||
--> resume
|
||||
(XEN) ----[ Xen-4.6-unstable x86_64 debug=y Tainted: C ]----
|
||||
(XEN) CPU: 8
|
||||
(XEN) RIP: e008:[<ffff82d080123078>] csched_schedule+0x4be/0xb97
|
||||
(XEN) RFLAGS: 0000000000010087 CONTEXT: hypervisor
|
||||
(XEN) rax: 80007d2f7fccb780 rbx: 0000000000000009 rcx: 0000000000000000
|
||||
(XEN) rdx: ffff82d08031ed40 rsi: ffff82d080334980 rdi: 0000000000000000
|
||||
(XEN) rbp: ffff83010000fe20 rsp: ffff83010000fd40 r8: 0000000000000004
|
||||
(XEN) r9: 0000ffff0000ffff r10: 00ff00ff00ff00ff r11: 0f0f0f0f0f0f0f0f
|
||||
(XEN) r12: ffff8303191ea870 r13: ffff8303226aadf0 r14: 0000000000000009
|
||||
(XEN) r15: 0000000000000008 cr0: 000000008005003b cr4: 00000000000026f0
|
||||
(XEN) cr3: 00000000dba9d000 cr2: 0000000000000000
|
||||
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: e008
|
||||
(XEN) ... ... ...
|
||||
(XEN) Xen call trace:
|
||||
(XEN) [<ffff82d080123078>] csched_schedule+0x4be/0xb97
|
||||
(XEN) [<ffff82d08012c732>] schedule+0x12a/0x63c
|
||||
(XEN) [<ffff82d08012f8c8>] __do_softirq+0x82/0x8d
|
||||
(XEN) [<ffff82d08012f920>] do_softirq+0x13/0x15
|
||||
(XEN) [<ffff82d080164791>] idle_loop+0x5b/0x6b
|
||||
(XEN)
|
||||
(XEN) ****************************************
|
||||
(XEN) Panic on CPU 8:
|
||||
(XEN) GENERAL PROTECTION FAULT
|
||||
(XEN) [error_code=0000]
|
||||
(XEN) ****************************************
|
||||
|
||||
The reason why the error is a #GP fault is that, without
|
||||
this commit, we try to access the per-cpu area of a not
|
||||
yet allocated and initialized pCPU.
|
||||
In fact, %rax, which is what is used as a pointer, is
|
||||
80007d2f7fccb780, and we also have this:
|
||||
|
||||
#define INVALID_PERCPU_AREA (0x8000000000000000L - (long)__per_cpu_start)
|
||||
|
||||
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
|
||||
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Acked-by: Juergen Gross <jgross@suse.com>
|
||||
|
||||
--- a/xen/arch/x86/smpboot.c
|
||||
+++ b/xen/arch/x86/smpboot.c
|
||||
@@ -816,7 +816,6 @@ void __cpu_disable(void)
|
||||
remove_siblinginfo(cpu);
|
||||
|
||||
/* It's now safe to remove this processor from the online map */
|
||||
- cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
|
||||
cpumask_clear_cpu(cpu, &cpu_online_map);
|
||||
fixup_irqs();
|
||||
|
||||
--- a/xen/common/cpupool.c
|
||||
+++ b/xen/common/cpupool.c
|
||||
@@ -529,6 +529,7 @@ static int cpupool_cpu_remove(unsigned i
|
||||
if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) )
|
||||
{
|
||||
cpumask_set_cpu(cpu, (*c)->cpu_suspended);
|
||||
+ cpumask_clear_cpu(cpu, (*c)->cpu_valid);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -551,6 +552,7 @@ static int cpupool_cpu_remove(unsigned i
|
||||
* If we are not suspending, we are hot-unplugging cpu, and that is
|
||||
* allowed only for CPUs in pool0.
|
||||
*/
|
||||
+ cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
|
||||
ret = 0;
|
||||
}
|
||||
|
@ -1,141 +0,0 @@
|
||||
# Commit 02ea5031825d984d52eb9a982b8457e3434137f0
|
||||
# Date 2015-07-07 14:30:06 +0200
|
||||
# Author Dario Faggioli <dario.faggioli@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
credit1: properly deal with pCPUs not in any cpupool
|
||||
|
||||
Ideally, the pCPUs that are 'free', i.e., not assigned
|
||||
to any cpupool, should not be considered by the scheduler
|
||||
for load balancing or anything. In Credit1, we fail at
|
||||
this, because of how we use cpupool_scheduler_cpumask().
|
||||
In fact, for a free pCPU, cpupool_scheduler_cpumask()
|
||||
returns a pointer to cpupool_free_cpus, and hence, near
|
||||
the top of csched_load_balance():
|
||||
|
||||
if ( unlikely(!cpumask_test_cpu(cpu, online)) )
|
||||
goto out;
|
||||
|
||||
is false (the pCPU _is_ free!), and we therefore do not
|
||||
jump to the end right away, as we should. This causes
|
||||
the following splat when resuming from ACPI S3 with
|
||||
pCPUs not assigned to any pool:
|
||||
|
||||
(XEN) ----[ Xen-4.6-unstable x86_64 debug=y Tainted: C ]----
|
||||
(XEN) ... ... ...
|
||||
(XEN) Xen call trace:
|
||||
(XEN) [<ffff82d080122eaa>] csched_load_balance+0x213/0x794
|
||||
(XEN) [<ffff82d08012374c>] csched_schedule+0x321/0x452
|
||||
(XEN) [<ffff82d08012c85e>] schedule+0x12a/0x63c
|
||||
(XEN) [<ffff82d08012fa09>] __do_softirq+0x82/0x8d
|
||||
(XEN) [<ffff82d08012fa61>] do_softirq+0x13/0x15
|
||||
(XEN) [<ffff82d080164780>] idle_loop+0x5b/0x6b
|
||||
(XEN)
|
||||
(XEN)
|
||||
(XEN) ****************************************
|
||||
(XEN) Panic on CPU 8:
|
||||
(XEN) GENERAL PROTECTION FAULT
|
||||
(XEN) [error_code=0000]
|
||||
(XEN) ****************************************
|
||||
|
||||
The cure is:
|
||||
* use cpupool_online_cpumask(), as a better guard to the
|
||||
case when the cpu is being offlined;
|
||||
* explicitly check whether the cpu is free.
|
||||
|
||||
SEDF is in a similar situation, so fix it too.
|
||||
|
||||
Still in Credit1, we must make sure that free (or offline)
|
||||
CPUs are not considered "ticklable". Not doing so would impair
|
||||
the load balancing algorithm, making the scheduler think that
|
||||
it is possible to 'ask' the pCPU to pick up some work, while
|
||||
in reality, that will never happen! Evidence of such behavior
|
||||
is shown in this trace:
|
||||
|
||||
Name CPU list
|
||||
Pool-0 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
||||
|
||||
0.112998198 | ||.|| -|x||-|- d0v0 runstate_change d0v4 offline->runnable
|
||||
] 0.112998198 | ||.|| -|x||-|- d0v0 22006(2:2:6) 1 [ f ]
|
||||
] 0.112999612 | ||.|| -|x||-|- d0v0 28004(2:8:4) 2 [ 0 4 ]
|
||||
0.113003387 | ||.|| -||||-|x d32767v15 runstate_continue d32767v15 running->running
|
||||
|
||||
where "22006(2:2:6) 1 [ f ]" means that pCPU 15, which is
|
||||
free from any pool, is tickled.
|
||||
|
||||
The cure, in this case, is to filter out the free pCPUs,
|
||||
within __runq_tickle().
|
||||
|
||||
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
|
||||
Acked-by: Juergen Gross <jgross@suse.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
|
||||
|
||||
--- a/xen/common/sched_credit.c
|
||||
+++ b/xen/common/sched_credit.c
|
||||
@@ -350,12 +350,17 @@ __runq_tickle(unsigned int cpu, struct c
|
||||
{
|
||||
struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu));
|
||||
struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
|
||||
- cpumask_t mask, idle_mask;
|
||||
+ cpumask_t mask, idle_mask, *online;
|
||||
int balance_step, idlers_empty;
|
||||
|
||||
ASSERT(cur);
|
||||
cpumask_clear(&mask);
|
||||
- idlers_empty = cpumask_empty(prv->idlers);
|
||||
+
|
||||
+ /* cpu is vc->processor, so it must be in a cpupool. */
|
||||
+ ASSERT(per_cpu(cpupool, cpu) != NULL);
|
||||
+ online = cpupool_online_cpumask(per_cpu(cpupool, cpu));
|
||||
+ cpumask_and(&idle_mask, prv->idlers, online);
|
||||
+ idlers_empty = cpumask_empty(&idle_mask);
|
||||
|
||||
|
||||
/*
|
||||
@@ -392,8 +397,8 @@ __runq_tickle(unsigned int cpu, struct c
|
||||
/* Are there idlers suitable for new (for this balance step)? */
|
||||
csched_balance_cpumask(new->vcpu, balance_step,
|
||||
csched_balance_mask);
|
||||
- cpumask_and(&idle_mask, prv->idlers, csched_balance_mask);
|
||||
- new_idlers_empty = cpumask_empty(&idle_mask);
|
||||
+ cpumask_and(csched_balance_mask, csched_balance_mask, &idle_mask);
|
||||
+ new_idlers_empty = cpumask_empty(csched_balance_mask);
|
||||
|
||||
/*
|
||||
* Let's not be too harsh! If there aren't idlers suitable
|
||||
@@ -1494,6 +1499,7 @@ static struct csched_vcpu *
|
||||
csched_load_balance(struct csched_private *prv, int cpu,
|
||||
struct csched_vcpu *snext, bool_t *stolen)
|
||||
{
|
||||
+ struct cpupool *c = per_cpu(cpupool, cpu);
|
||||
struct csched_vcpu *speer;
|
||||
cpumask_t workers;
|
||||
cpumask_t *online;
|
||||
@@ -1501,10 +1507,13 @@ csched_load_balance(struct csched_privat
|
||||
int node = cpu_to_node(cpu);
|
||||
|
||||
BUG_ON( cpu != snext->vcpu->processor );
|
||||
- online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu));
|
||||
+ online = cpupool_online_cpumask(c);
|
||||
|
||||
- /* If this CPU is going offline we shouldn't steal work. */
|
||||
- if ( unlikely(!cpumask_test_cpu(cpu, online)) )
|
||||
+ /*
|
||||
+ * If this CPU is going offline, or is not (yet) part of any cpupool
|
||||
+ * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
|
||||
+ */
|
||||
+ if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
|
||||
goto out;
|
||||
|
||||
if ( snext->pri == CSCHED_PRI_IDLE )
|
||||
--- a/xen/common/sched_sedf.c
|
||||
+++ b/xen/common/sched_sedf.c
|
||||
@@ -791,7 +791,8 @@ static struct task_slice sedf_do_schedul
|
||||
if ( tasklet_work_scheduled ||
|
||||
(list_empty(runq) && list_empty(waitq)) ||
|
||||
unlikely(!cpumask_test_cpu(cpu,
|
||||
- cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)))) )
|
||||
+ cpupool_online_cpumask(per_cpu(cpupool, cpu))) ||
|
||||
+ per_cpu(cpupool, cpu) == NULL) )
|
||||
{
|
||||
ret.task = IDLETASK(cpu);
|
||||
ret.time = SECONDS(1);
|
@ -1,68 +0,0 @@
|
||||
# Commit bbbe7e7157a964c485fb861765be291734676932
|
||||
# Date 2015-07-07 14:39:27 +0200
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/hvmloader: avoid data corruption with xenstore reads/writes
|
||||
|
||||
The functions ring_read and ring_write() have logic to try and deal with
|
||||
partial reads and writes.
|
||||
|
||||
However, in all cases where the "while (len)" loop executed twice, data
|
||||
corruption would occur as the second memcpy() starts from the beginning of
|
||||
"data" again, rather than from where it got to.
|
||||
|
||||
This bug manifested itself as protocol corruption when a reply header crossed
|
||||
the first wrap of the response ring. However, similar corruption would also
|
||||
occur if hvmloader observed xenstored performing partial writes of the block
|
||||
in question, or if hvmloader had to wait for xenstored to make space in either
|
||||
ring.
|
||||
|
||||
Reported-by: Adam Kucia <djexit@o2.pl>
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/tools/firmware/hvmloader/xenbus.c
|
||||
+++ b/tools/firmware/hvmloader/xenbus.c
|
||||
@@ -105,7 +105,7 @@ void xenbus_shutdown(void)
|
||||
/* Helper functions: copy data in and out of the ring */
|
||||
static void ring_write(const char *data, uint32_t len)
|
||||
{
|
||||
- uint32_t part;
|
||||
+ uint32_t part, done = 0;
|
||||
|
||||
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
|
||||
|
||||
@@ -122,16 +122,18 @@ static void ring_write(const char *data,
|
||||
if ( part > len )
|
||||
part = len;
|
||||
|
||||
- memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod), data, part);
|
||||
+ memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod),
|
||||
+ data + done, part);
|
||||
barrier(); /* = wmb before prod write, rmb before next cons read */
|
||||
rings->req_prod += part;
|
||||
len -= part;
|
||||
+ done += part;
|
||||
}
|
||||
}
|
||||
|
||||
static void ring_read(char *data, uint32_t len)
|
||||
{
|
||||
- uint32_t part;
|
||||
+ uint32_t part, done = 0;
|
||||
|
||||
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
|
||||
|
||||
@@ -148,10 +150,12 @@ static void ring_read(char *data, uint32
|
||||
if ( part > len )
|
||||
part = len;
|
||||
|
||||
- memcpy(data, rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
|
||||
+ memcpy(data + done,
|
||||
+ rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
|
||||
barrier(); /* = wmb before cons write, rmb before next prod read */
|
||||
rings->rsp_cons += part;
|
||||
len -= part;
|
||||
+ done += part;
|
||||
}
|
||||
}
|
||||
|
@ -1,102 +0,0 @@
|
||||
# Commit 39c6664a0e6e1b4ed80660d545dff34ce41bee31
|
||||
# Date 2015-07-07 15:10:45 +0100
|
||||
# Author Ian Campbell <ian.campbell@citrix.com>
|
||||
# Committer Ian Campbell <ian.campbell@citrix.com>
|
||||
xen: earlycpio: Pull in latest linux earlycpio.[ch]
|
||||
|
||||
AFAICT our current version does not correspond to any version in the
|
||||
Linux history. This commit resynchronises to the state in Linux
|
||||
commit 598bae70c2a8e35c8d39b610cca2b32afcf047af.
|
||||
|
||||
Differences from upstream: find_cpio_data is __init, printk instead of
|
||||
pr_*.
|
||||
|
||||
This appears to fix Debian bug #785187. "Appears" because my test box
|
||||
happens to be AMD and the issue is that the (valid) cpio generated by
|
||||
the Intel ucode is not liked by the old Xen code. I've tested by
|
||||
hacking the hypervisor to look for the Intel path.
|
||||
|
||||
Reported-by: Stephan Seitz <stse+debianbugs@fsing.rootsland.net>
|
||||
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
|
||||
Cc: Jan Beulich <jbeulich@suse.com>
|
||||
Cc: Stephan Seitz <stse+debianbugs@fsing.rootsland.net>
|
||||
Cc: 785187@bugs.debian.org
|
||||
Acked-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/common/earlycpio.c
|
||||
+++ b/xen/common/earlycpio.c
|
||||
@@ -54,25 +54,26 @@ enum cpio_fields {
|
||||
|
||||
/**
|
||||
* cpio_data find_cpio_data - Search for files in an uncompressed cpio
|
||||
- * @path: The directory to search for, including a slash at the end
|
||||
- * @data: Pointer to the the cpio archive or a header inside
|
||||
- * @len: Remaining length of the cpio based on data pointer
|
||||
- * @offset: When a matching file is found, this is the offset to the
|
||||
- * beginning of the cpio. It can be used to iterate through
|
||||
- * the cpio to find all files inside of a directory path
|
||||
+ * @path: The directory to search for, including a slash at the end
|
||||
+ * @data: Pointer to the the cpio archive or a header inside
|
||||
+ * @len: Remaining length of the cpio based on data pointer
|
||||
+ * @nextoff: When a matching file is found, this is the offset from the
|
||||
+ * beginning of the cpio to the beginning of the next file, not the
|
||||
+ * matching file itself. It can be used to iterate through the cpio
|
||||
+ * to find all files inside of a directory path.
|
||||
*
|
||||
- * @return: struct cpio_data containing the address, length and
|
||||
- * filename (with the directory path cut off) of the found file.
|
||||
- * If you search for a filename and not for files in a directory,
|
||||
- * pass the absolute path of the filename in the cpio and make sure
|
||||
- * the match returned an empty filename string.
|
||||
+ * @return: struct cpio_data containing the address, length and
|
||||
+ * filename (with the directory path cut off) of the found file.
|
||||
+ * If you search for a filename and not for files in a directory,
|
||||
+ * pass the absolute path of the filename in the cpio and make sure
|
||||
+ * the match returned an empty filename string.
|
||||
*/
|
||||
|
||||
struct cpio_data __init find_cpio_data(const char *path, void *data,
|
||||
- size_t len, long *offset)
|
||||
+ size_t len, long *nextoff)
|
||||
{
|
||||
const size_t cpio_header_len = 8*C_NFIELDS - 2;
|
||||
- struct cpio_data cd = { NULL, 0 };
|
||||
+ struct cpio_data cd = { NULL, 0, "" };
|
||||
const char *p, *dptr, *nptr;
|
||||
unsigned int ch[C_NFIELDS], *chp, v;
|
||||
unsigned char c, x;
|
||||
@@ -129,17 +130,17 @@ struct cpio_data __init find_cpio_data(c
|
||||
if ((ch[C_MODE] & 0170000) == 0100000 &&
|
||||
ch[C_NAMESIZE] >= mypathsize &&
|
||||
!memcmp(p, path, mypathsize)) {
|
||||
- *offset = (long)nptr - (long)data;
|
||||
+ *nextoff = (long)nptr - (long)data;
|
||||
if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) {
|
||||
printk(
|
||||
"File %s exceeding MAX_CPIO_FILE_NAME [%d]\n",
|
||||
p, MAX_CPIO_FILE_NAME);
|
||||
}
|
||||
- if (ch[C_NAMESIZE] - 1 /* includes \0 */ == mypathsize) {
|
||||
- cd.data = (void *)dptr;
|
||||
- cd.size = ch[C_FILESIZE];
|
||||
- return cd; /* Found it! */
|
||||
- }
|
||||
+ strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME);
|
||||
+
|
||||
+ cd.data = (void *)dptr;
|
||||
+ cd.size = ch[C_FILESIZE];
|
||||
+ return cd; /* Found it! */
|
||||
}
|
||||
len -= (nptr - p);
|
||||
p = nptr;
|
||||
--- a/xen/include/xen/earlycpio.h
|
||||
+++ b/xen/include/xen/earlycpio.h
|
||||
@@ -6,6 +6,7 @@
|
||||
struct cpio_data {
|
||||
void *data;
|
||||
size_t size;
|
||||
+ char name[MAX_CPIO_FILE_NAME];
|
||||
};
|
||||
|
||||
struct cpio_data find_cpio_data(const char *path, void *data, size_t len,
|
@ -1,37 +0,0 @@
|
||||
Subject: xl: correct handling of extra_config in main_cpupoolcreate
|
||||
From: Wei Liu wei.liu2@citrix.com Tue Jul 14 17:41:10 2015 +0100
|
||||
Date: Wed Jul 15 10:58:08 2015 +0100:
|
||||
Git: 705c9e12426cba82804cb578fc70785281655d94
|
||||
|
||||
Don't dereference extra_config if it's NULL. Don't leak extra_config in
|
||||
the end.
|
||||
|
||||
Also fixed a typo in the error string while I was there.
|
||||
|
||||
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
|
||||
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
|
||||
|
||||
Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/xl_cmdimpl.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
|
||||
@@ -7085,9 +7085,9 @@ int main_cpupoolcreate(int argc, char **
|
||||
else
|
||||
config_src="command line";
|
||||
|
||||
- if (strlen(extra_config)) {
|
||||
+ if (extra_config && strlen(extra_config)) {
|
||||
if (config_len > INT_MAX - (strlen(extra_config) + 2)) {
|
||||
- fprintf(stderr, "Failed to attach extra configration\n");
|
||||
+ fprintf(stderr, "Failed to attach extra configuration\n");
|
||||
goto out;
|
||||
}
|
||||
config_data = xrealloc(config_data,
|
||||
@@ -7211,6 +7211,7 @@ out_cfg:
|
||||
out:
|
||||
free(name);
|
||||
free(config_data);
|
||||
+ free(extra_config);
|
||||
return rc;
|
||||
}
|
||||
|
@ -1,24 +0,0 @@
# Commit b1c780cd315eb4db06be3bbb5c6d80b1cabd27a9
# Date 2015-07-15 16:11:42 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
make rangeset_report_ranges() report all ranges

find_range() returns NULL when s is below the lowest range, so we have
to use first_range() here (which is just as good performance-wise), or else
no range gets reported at all in that case.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -289,7 +289,7 @@ int rangeset_report_ranges(

read_lock(&r->lock);

- for ( x = find_range(r, s); x && (x->s <= e) && !rc; x = next_range(r, x) )
+ for ( x = first_range(r); x && (x->s <= e) && !rc; x = next_range(r, x) )
if ( x->e >= s )
rc = cb(max(x->s, s), min(x->e, e), ctxt);

@ -1,135 +0,0 @@
|
||||
# Commit a8bc99b981c5ad773bd646f5986e616d26fb94d7
|
||||
# Date 2015-07-16 11:50:07 +0200
|
||||
# Author Elena Ufimtseva <elena.ufimtseva@oracle.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
dmar: device scope mem leak fix
|
||||
|
||||
Release memory allocated for scope.devices of dmar units on various
|
||||
failure paths and when disabling dmar. Set device count after
|
||||
successful memory allocation, not before, in the device scope parsing function.
|
||||
|
||||
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
# Commit 132231d10343608faf5892785a08acc500326d04
|
||||
# Date 2015-07-16 15:23:37 +0200
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
dmar: fix double free in error paths following c/s a8bc99b
|
||||
|
||||
Several error paths would end up freeing scope->devices twice.
|
||||
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
|
||||
--- a/xen/drivers/passthrough/vtd/dmar.c
|
||||
+++ b/xen/drivers/passthrough/vtd/dmar.c
|
||||
@@ -80,6 +80,16 @@ static int __init acpi_register_rmrr_uni
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static void scope_devices_free(struct dmar_scope *scope)
|
||||
+{
|
||||
+ if ( !scope )
|
||||
+ return;
|
||||
+
|
||||
+ scope->devices_cnt = 0;
|
||||
+ xfree(scope->devices);
|
||||
+ scope->devices = NULL;
|
||||
+}
|
||||
+
|
||||
static void __init disable_all_dmar_units(void)
|
||||
{
|
||||
struct acpi_drhd_unit *drhd, *_drhd;
|
||||
@@ -89,16 +99,19 @@ static void __init disable_all_dmar_unit
|
||||
list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list )
|
||||
{
|
||||
list_del(&drhd->list);
|
||||
+ scope_devices_free(&drhd->scope);
|
||||
xfree(drhd);
|
||||
}
|
||||
list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list )
|
||||
{
|
||||
list_del(&rmrr->list);
|
||||
+ scope_devices_free(&rmrr->scope);
|
||||
xfree(rmrr);
|
||||
}
|
||||
list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list )
|
||||
{
|
||||
list_del(&atsr->list);
|
||||
+ scope_devices_free(&atsr->scope);
|
||||
xfree(atsr);
|
||||
}
|
||||
}
|
||||
@@ -317,13 +330,13 @@ static int __init acpi_parse_dev_scope(
|
||||
if ( (cnt = scope_device_count(start, end)) < 0 )
|
||||
return cnt;
|
||||
|
||||
- scope->devices_cnt = cnt;
|
||||
if ( cnt > 0 )
|
||||
{
|
||||
scope->devices = xzalloc_array(u16, cnt);
|
||||
if ( !scope->devices )
|
||||
return -ENOMEM;
|
||||
}
|
||||
+ scope->devices_cnt = cnt;
|
||||
|
||||
while ( start < end )
|
||||
{
|
||||
@@ -426,7 +439,7 @@ static int __init acpi_parse_dev_scope(
|
||||
|
||||
out:
|
||||
if ( ret )
|
||||
- xfree(scope->devices);
|
||||
+ scope_devices_free(scope);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -541,6 +554,7 @@ acpi_parse_one_drhd(struct acpi_dmar_hea
|
||||
" Workaround BIOS bug: ignore the DRHD due to all "
|
||||
"devices under its scope are not PCI discoverable!\n");
|
||||
|
||||
+ scope_devices_free(&dmaru->scope);
|
||||
iommu_free(dmaru);
|
||||
xfree(dmaru);
|
||||
}
|
||||
@@ -561,9 +575,11 @@ acpi_parse_one_drhd(struct acpi_dmar_hea
|
||||
out:
|
||||
if ( ret )
|
||||
{
|
||||
+ scope_devices_free(&dmaru->scope);
|
||||
iommu_free(dmaru);
|
||||
xfree(dmaru);
|
||||
}
|
||||
+
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -657,6 +673,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_hea
|
||||
" Ignore the RMRR (%"PRIx64", %"PRIx64") due to "
|
||||
"devices under its scope are not PCI discoverable!\n",
|
||||
rmrru->base_address, rmrru->end_address);
|
||||
+ scope_devices_free(&rmrru->scope);
|
||||
xfree(rmrru);
|
||||
}
|
||||
else if ( base_addr > end_addr )
|
||||
@@ -664,6 +681,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_hea
|
||||
dprintk(XENLOG_WARNING VTDPREFIX,
|
||||
" The RMRR (%"PRIx64", %"PRIx64") is incorrect!\n",
|
||||
rmrru->base_address, rmrru->end_address);
|
||||
+ scope_devices_free(&rmrru->scope);
|
||||
xfree(rmrru);
|
||||
ret = -EFAULT;
|
||||
}
|
||||
@@ -726,7 +744,10 @@ acpi_parse_one_atsr(struct acpi_dmar_hea
|
||||
}
|
||||
|
||||
if ( ret )
|
||||
+ {
|
||||
+ scope_devices_free(&atsru->scope);
|
||||
xfree(atsru);
|
||||
+ }
|
||||
else
|
||||
acpi_register_atsr_unit(atsru);
|
||||
return ret;
|
@ -1,120 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit a88b72fddd046a0978242411276861039ec99ad0
|
||||
# Date 2015-07-23 10:13:12 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/PCI: add config space abstract write intercept logic
|
||||
|
||||
This is to be used by MSI code, and later to also be hooked up to
|
||||
MMCFG accesses by Dom0.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -1108,6 +1108,12 @@ void pci_cleanup_msi(struct pci_dev *pde
|
||||
msi_free_irqs(pdev);
|
||||
}
|
||||
|
||||
+int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
|
||||
+ unsigned int size, uint32_t *data)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
int pci_restore_msi_state(struct pci_dev *pdev)
|
||||
{
|
||||
unsigned long flags;
|
||||
--- a/xen/arch/x86/pci.c
|
||||
+++ b/xen/arch/x86/pci.c
|
||||
@@ -67,3 +67,28 @@ void pci_conf_write(uint32_t cf8, uint8_
|
||||
|
||||
spin_unlock_irqrestore(&pci_config_lock, flags);
|
||||
}
|
||||
+
|
||||
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
|
||||
+ unsigned int reg, unsigned int size,
|
||||
+ uint32_t *data)
|
||||
+{
|
||||
+ struct pci_dev *pdev;
|
||||
+ int rc = 0;
|
||||
+
|
||||
+ /*
|
||||
+ * Avoid expensive operations when no hook is going to do anything
|
||||
+ * for the access anyway.
|
||||
+ */
|
||||
+ if ( reg < 64 || reg >= 256 )
|
||||
+ return 0;
|
||||
+
|
||||
+ spin_lock(&pcidevs_lock);
|
||||
+
|
||||
+ pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
|
||||
+ if ( pdev )
|
||||
+ rc = pci_msi_conf_write_intercept(pdev, reg, size, data);
|
||||
+
|
||||
+ spin_unlock(&pcidevs_lock);
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
--- a/xen/arch/x86/traps.c
|
||||
+++ b/xen/arch/x86/traps.c
|
||||
@@ -1708,8 +1708,8 @@ static int admin_io_okay(
|
||||
return ioports_access_permitted(v->domain, port, port + bytes - 1);
|
||||
}
|
||||
|
||||
-static bool_t pci_cfg_ok(struct domain *currd, bool_t write,
|
||||
- unsigned int start, unsigned int size)
|
||||
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
|
||||
+ unsigned int size, uint32_t *write)
|
||||
{
|
||||
uint32_t machine_bdf;
|
||||
|
||||
@@ -1741,8 +1741,12 @@ static bool_t pci_cfg_ok(struct domain *
|
||||
start |= CF8_ADDR_HI(currd->arch.pci_cf8);
|
||||
}
|
||||
|
||||
- return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
|
||||
- start, start + size - 1, write);
|
||||
+ if ( xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
|
||||
+ start, start + size - 1, !!write) != 0 )
|
||||
+ return 0;
|
||||
+
|
||||
+ return !write ||
|
||||
+ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
|
||||
}
|
||||
|
||||
uint32_t guest_io_read(
|
||||
@@ -1796,7 +1800,7 @@ uint32_t guest_io_read(
|
||||
size = min(bytes, 4 - (port & 3));
|
||||
if ( size == 3 )
|
||||
size = 2;
|
||||
- if ( pci_cfg_ok(v->domain, 0, port & 3, size) )
|
||||
+ if ( pci_cfg_ok(v->domain, port & 3, size, NULL) )
|
||||
sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
|
||||
}
|
||||
|
||||
@@ -1869,7 +1873,7 @@ void guest_io_write(
|
||||
size = min(bytes, 4 - (port & 3));
|
||||
if ( size == 3 )
|
||||
size = 2;
|
||||
- if ( pci_cfg_ok(v->domain, 1, port & 3, size) )
|
||||
+ if ( pci_cfg_ok(v->domain, port & 3, size, &data) )
|
||||
pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
|
||||
}
|
||||
|
||||
--- a/xen/include/asm-x86/pci.h
|
||||
+++ b/xen/include/asm-x86/pci.h
|
||||
@@ -15,4 +15,11 @@ struct arch_pci_dev {
|
||||
vmask_t used_vectors;
|
||||
};
|
||||
|
||||
+struct pci_dev;
|
||||
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
|
||||
+ unsigned int reg, unsigned int size,
|
||||
+ uint32_t *data);
|
||||
+int pci_msi_conf_write_intercept(struct pci_dev *, unsigned int reg,
|
||||
+ unsigned int size, uint32_t *data);
|
||||
+
|
||||
#endif /* __X86_PCI_H__ */
|
@ -1,75 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit 484d7c852e4ff79c945406ed28b5db63a5a0b7f3
|
||||
# Date 2015-07-23 10:14:13 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI-X: track host and guest mask-all requests separately
|
||||
|
||||
Host uses of the bits will be added subsequently, and must not be
|
||||
overridden by guests (including Dom0, namely when acting on behalf of
|
||||
a guest).
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -843,6 +843,12 @@ static int msix_capability_init(struct p
|
||||
|
||||
if ( !msix->used_entries )
|
||||
{
|
||||
+ msix->host_maskall = 0;
|
||||
+ if ( !msix->guest_maskall )
|
||||
+ control &= ~PCI_MSIX_FLAGS_MASKALL;
|
||||
+ else
|
||||
+ control |= PCI_MSIX_FLAGS_MASKALL;
|
||||
+
|
||||
if ( rangeset_add_range(mmio_ro_ranges, msix->table.first,
|
||||
msix->table.last) )
|
||||
WARN();
|
||||
@@ -1111,6 +1117,34 @@ void pci_cleanup_msi(struct pci_dev *pde
|
||||
int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
|
||||
unsigned int size, uint32_t *data)
|
||||
{
|
||||
+ u16 seg = pdev->seg;
|
||||
+ u8 bus = pdev->bus;
|
||||
+ u8 slot = PCI_SLOT(pdev->devfn);
|
||||
+ u8 func = PCI_FUNC(pdev->devfn);
|
||||
+ struct msi_desc *entry;
|
||||
+ unsigned int pos;
|
||||
+
|
||||
+ if ( pdev->msix )
|
||||
+ {
|
||||
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
|
||||
+ pos = entry ? entry->msi_attrib.pos
|
||||
+ : pci_find_cap_offset(seg, bus, slot, func,
|
||||
+ PCI_CAP_ID_MSIX);
|
||||
+ ASSERT(pos);
|
||||
+
|
||||
+ if ( reg < pos || reg >= msix_pba_offset_reg(pos) + 4 )
|
||||
+ return 0;
|
||||
+
|
||||
+ if ( reg != msix_control_reg(pos) || size != 2 )
|
||||
+ return -EACCES;
|
||||
+
|
||||
+ pdev->msix->guest_maskall = !!(*data & PCI_MSIX_FLAGS_MASKALL);
|
||||
+ if ( pdev->msix->host_maskall )
|
||||
+ *data |= PCI_MSIX_FLAGS_MASKALL;
|
||||
+
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
||||
--- a/xen/include/asm-x86/msi.h
|
||||
+++ b/xen/include/asm-x86/msi.h
|
||||
@@ -228,6 +228,7 @@ struct arch_msix {
|
||||
int table_refcnt[MAX_MSIX_TABLE_PAGES];
|
||||
int table_idx[MAX_MSIX_TABLE_PAGES];
|
||||
spinlock_t table_lock;
|
||||
+ bool_t host_maskall, guest_maskall;
|
||||
domid_t warned;
|
||||
};
|
||||
|
@ -1,351 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit 082fdc6ce85e5b603f8fb24553cf200e3b67889f
|
||||
# Date 2015-07-23 10:14:59 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI-X: be more careful during teardown
|
||||
|
||||
When a device gets detached from a guest, pciback will clear its
|
||||
command register, thus disabling both memory and I/O decoding. The
|
||||
disabled memory decoding, however, has an effect on the MSI-X table
|
||||
accesses the hypervisor does: These won't have the intended effect
|
||||
anymore. Even worse, for PCIe devices (but not SR-IOV virtual
|
||||
functions) such accesses may (will?) be treated as Unsupported
|
||||
Requests, causing respective errors to be surfaced, potentially in the
|
||||
form of NMIs that may be fatal to the hypervisor or Dom0 in different
|
||||
ways. Hence rather than carrying out these accesses, we should avoid
|
||||
them where we can, and use alternative (e.g. PCI config space based)
|
||||
mechanisms to achieve at least the same effect.
|
||||
|
||||
At this time it continues to be unclear whether this is fixing an
|
||||
actual bug or is rather just working around bogus (but apparently
|
||||
common) system behavior.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
---
|
||||
Backporting note (largely to myself):
|
||||
Depends on (not yet backported to 4.4 and earlier) commit 061eebe0e
|
||||
"x86/MSI: drop workaround for insecure Dom0 kernels" (due to re-use
|
||||
of struct arch_msix's warned field).
|
||||
|
||||
--- a/xen/arch/x86/irq.c
|
||||
+++ b/xen/arch/x86/irq.c
|
||||
@@ -217,9 +217,9 @@ void destroy_irq(unsigned int irq)
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
- desc->status |= IRQ_DISABLED;
|
||||
desc->status &= ~IRQ_GUEST;
|
||||
desc->handler->shutdown(desc);
|
||||
+ desc->status |= IRQ_DISABLED;
|
||||
action = desc->action;
|
||||
desc->action = NULL;
|
||||
desc->msi_desc = NULL;
|
||||
@@ -995,8 +995,8 @@ void __init release_irq(unsigned int irq
|
||||
spin_lock_irqsave(&desc->lock,flags);
|
||||
action = desc->action;
|
||||
desc->action = NULL;
|
||||
- desc->status |= IRQ_DISABLED;
|
||||
desc->handler->shutdown(desc);
|
||||
+ desc->status |= IRQ_DISABLED;
|
||||
spin_unlock_irqrestore(&desc->lock,flags);
|
||||
|
||||
/* Wait to make sure it's not being used on another CPU */
|
||||
@@ -1732,8 +1732,8 @@ static irq_guest_action_t *__pirq_guest_
|
||||
BUG_ON(action->in_flight != 0);
|
||||
|
||||
/* Disabling IRQ before releasing the desc_lock avoids an IRQ storm. */
|
||||
- desc->status |= IRQ_DISABLED;
|
||||
desc->handler->disable(desc);
|
||||
+ desc->status |= IRQ_DISABLED;
|
||||
|
||||
/*
|
||||
* Mark any remaining pending EOIs as ready to flush.
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -123,6 +123,27 @@ static void msix_put_fixmap(struct arch_
|
||||
spin_unlock(&msix->table_lock);
|
||||
}
|
||||
|
||||
+static bool_t memory_decoded(const struct pci_dev *dev)
|
||||
+{
|
||||
+ u8 bus, slot, func;
|
||||
+
|
||||
+ if ( !dev->info.is_virtfn )
|
||||
+ {
|
||||
+ bus = dev->bus;
|
||||
+ slot = PCI_SLOT(dev->devfn);
|
||||
+ func = PCI_FUNC(dev->devfn);
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ bus = dev->info.physfn.bus;
|
||||
+ slot = PCI_SLOT(dev->info.physfn.devfn);
|
||||
+ func = PCI_FUNC(dev->info.physfn.devfn);
|
||||
+ }
|
||||
+
|
||||
+ return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
|
||||
+ PCI_COMMAND_MEMORY);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* MSI message composition
|
||||
*/
|
||||
@@ -166,7 +187,7 @@ void msi_compose_msg(unsigned vector, co
|
||||
}
|
||||
}
|
||||
|
||||
-static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||||
+static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||||
{
|
||||
switch ( entry->msi_attrib.type )
|
||||
{
|
||||
@@ -201,6 +222,8 @@ static void read_msi_msg(struct msi_desc
|
||||
{
|
||||
void __iomem *base = entry->mask_base;
|
||||
|
||||
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ return 0;
|
||||
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
|
||||
msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
|
||||
@@ -212,6 +235,8 @@ static void read_msi_msg(struct msi_desc
|
||||
|
||||
if ( iommu_intremap )
|
||||
iommu_read_msi_from_ire(entry, msg);
|
||||
+
|
||||
+ return 1;
|
||||
}
|
||||
|
||||
static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||||
@@ -262,6 +287,8 @@ static int write_msi_msg(struct msi_desc
|
||||
{
|
||||
void __iomem *base = entry->mask_base;
|
||||
|
||||
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ return -ENXIO;
|
||||
writel(msg->address_lo,
|
||||
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
writel(msg->address_hi,
|
||||
@@ -289,7 +316,8 @@ void set_msi_affinity(struct irq_desc *d
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
- read_msi_msg(msi_desc, &msg);
|
||||
+ if ( !read_msi_msg(msi_desc, &msg) )
|
||||
+ return;
|
||||
|
||||
msg.data &= ~MSI_DATA_VECTOR_MASK;
|
||||
msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
|
||||
@@ -349,23 +377,27 @@ int msi_maskable_irq(const struct msi_de
|
||||
|| entry->msi_attrib.maskbit;
|
||||
}
|
||||
|
||||
-static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
|
||||
+static bool_t msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
|
||||
{
|
||||
struct msi_desc *entry = desc->msi_desc;
|
||||
+ struct pci_dev *pdev;
|
||||
+ u16 seg;
|
||||
+ u8 bus, slot, func;
|
||||
bool_t flag = host || guest;
|
||||
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
BUG_ON(!entry || !entry->dev);
|
||||
+ pdev = entry->dev;
|
||||
+ seg = pdev->seg;
|
||||
+ bus = pdev->bus;
|
||||
+ slot = PCI_SLOT(pdev->devfn);
|
||||
+ func = PCI_FUNC(pdev->devfn);
|
||||
switch ( entry->msi_attrib.type )
|
||||
{
|
||||
case PCI_CAP_ID_MSI:
|
||||
if ( entry->msi_attrib.maskbit )
|
||||
{
|
||||
u32 mask_bits;
|
||||
- u16 seg = entry->dev->seg;
|
||||
- u8 bus = entry->dev->bus;
|
||||
- u8 slot = PCI_SLOT(entry->dev->devfn);
|
||||
- u8 func = PCI_FUNC(entry->dev->devfn);
|
||||
|
||||
mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
|
||||
mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
|
||||
@@ -374,25 +406,54 @@ static void msi_set_mask_bit(struct irq_
|
||||
}
|
||||
break;
|
||||
case PCI_CAP_ID_MSIX:
|
||||
- {
|
||||
- int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
|
||||
- writel(flag, entry->mask_base + offset);
|
||||
- readl(entry->mask_base + offset);
|
||||
- break;
|
||||
- }
|
||||
+ if ( likely(memory_decoded(pdev)) )
|
||||
+ {
|
||||
+ writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
+ readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
+ break;
|
||||
+ }
|
||||
+ if ( flag )
|
||||
+ {
|
||||
+ u16 control;
|
||||
+ domid_t domid = pdev->domain->domain_id;
|
||||
+
|
||||
+ pdev->msix->host_maskall = 1;
|
||||
+ control = pci_conf_read16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos));
|
||||
+ if ( control & PCI_MSIX_FLAGS_MASKALL )
|
||||
+ break;
|
||||
+ pci_conf_write16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos),
|
||||
+ control | PCI_MSIX_FLAGS_MASKALL);
|
||||
+ if ( pdev->msix->warned != domid )
|
||||
+ {
|
||||
+ pdev->msix->warned = domid;
|
||||
+ printk(XENLOG_G_WARNING
|
||||
+ "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
|
||||
+ desc->irq, domid, pdev->seg, pdev->bus,
|
||||
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
+ /* fall through */
|
||||
default:
|
||||
- BUG();
|
||||
- break;
|
||||
+ return 0;
|
||||
}
|
||||
entry->msi_attrib.host_masked = host;
|
||||
entry->msi_attrib.guest_masked = guest;
|
||||
+
|
||||
+ return 1;
|
||||
}
|
||||
|
||||
static int msi_get_mask_bit(const struct msi_desc *entry)
|
||||
{
|
||||
- switch (entry->msi_attrib.type) {
|
||||
+ if ( !entry->dev )
|
||||
+ return -1;
|
||||
+
|
||||
+ switch ( entry->msi_attrib.type )
|
||||
+ {
|
||||
case PCI_CAP_ID_MSI:
|
||||
- if (!entry->dev || !entry->msi_attrib.maskbit)
|
||||
+ if ( !entry->msi_attrib.maskbit )
|
||||
break;
|
||||
return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
|
||||
PCI_SLOT(entry->dev->devfn),
|
||||
@@ -400,6 +461,8 @@ static int msi_get_mask_bit(const struct
|
||||
entry->msi.mpos) >>
|
||||
entry->msi_attrib.entry_nr) & 1;
|
||||
case PCI_CAP_ID_MSIX:
|
||||
+ if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ break;
|
||||
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
|
||||
}
|
||||
return -1;
|
||||
@@ -407,12 +470,16 @@ static int msi_get_mask_bit(const struct
|
||||
|
||||
void mask_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked);
|
||||
+ if ( unlikely(!msi_set_mask_bit(desc, 1,
|
||||
+ desc->msi_desc->msi_attrib.guest_masked)) )
|
||||
+ BUG_ON(!(desc->status & IRQ_DISABLED));
|
||||
}
|
||||
|
||||
void unmask_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked);
|
||||
+ if ( unlikely(!msi_set_mask_bit(desc, 0,
|
||||
+ desc->msi_desc->msi_attrib.guest_masked)) )
|
||||
+ WARN();
|
||||
}
|
||||
|
||||
void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask)
|
||||
@@ -422,13 +489,15 @@ void guest_mask_msi_irq(struct irq_desc
|
||||
|
||||
static unsigned int startup_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST));
|
||||
+ if ( unlikely(!msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST))) )
|
||||
+ WARN();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void shutdown_msi_irq(struct irq_desc *desc)
|
||||
{
|
||||
- msi_set_mask_bit(desc, 1, 1);
|
||||
+ if ( unlikely(!msi_set_mask_bit(desc, 1, 1)) )
|
||||
+ BUG_ON(!(desc->status & IRQ_DISABLED));
|
||||
}
|
||||
|
||||
void ack_nonmaskable_msi_irq(struct irq_desc *desc)
|
||||
@@ -740,6 +809,9 @@ static int msix_capability_init(struct p
|
||||
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||||
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
|
||||
|
||||
+ if ( unlikely(!memory_decoded(dev)) )
|
||||
+ return -ENXIO;
|
||||
+
|
||||
if ( desc )
|
||||
{
|
||||
entry = alloc_msi_entry(1);
|
||||
@@ -879,7 +951,8 @@ static int msix_capability_init(struct p
|
||||
++msix->used_entries;
|
||||
|
||||
/* Restore MSI-X enabled bits */
|
||||
- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_MASKALL);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1024,8 +1097,16 @@ static void __pci_disable_msix(struct ms
|
||||
|
||||
BUG_ON(list_empty(&dev->msi_list));
|
||||
|
||||
- writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
-
|
||||
+ if ( likely(memory_decoded(dev)) )
|
||||
+ writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
+ else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
|
||||
+ {
|
||||
+ printk(XENLOG_WARNING
|
||||
+ "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
|
||||
+ entry->irq, dev->seg, dev->bus,
|
||||
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
|
||||
+ control |= PCI_MSIX_FLAGS_MASKALL;
|
||||
+ }
|
||||
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||||
|
||||
_pci_cleanup_msix(dev->msix);
|
||||
@@ -1199,15 +1280,24 @@ int pci_restore_msi_state(struct pci_dev
|
||||
nr = entry->msi.nvec;
|
||||
}
|
||||
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
|
||||
+ {
|
||||
msix_set_enable(pdev, 0);
|
||||
+ if ( unlikely(!memory_decoded(pdev)) )
|
||||
+ {
|
||||
+ spin_unlock_irqrestore(&desc->lock, flags);
|
||||
+ return -ENXIO;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
msg = entry->msg;
|
||||
write_msi_msg(entry, &msg);
|
||||
|
||||
for ( i = 0; ; )
|
||||
{
|
||||
- msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked,
|
||||
- entry[i].msi_attrib.guest_masked);
|
||||
+ if ( unlikely(!msi_set_mask_bit(desc,
|
||||
+ entry[i].msi_attrib.host_masked,
|
||||
+ entry[i].msi_attrib.guest_masked)) )
|
||||
+ BUG();
|
||||
|
||||
if ( !--nr )
|
||||
break;
|
@ -1,335 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit 0dba393db07331e9cff42df10e95b67547dfdb3e
|
||||
# Date 2015-07-23 10:15:39 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI-X: access MSI-X table only after having enabled MSI-X
|
||||
|
||||
As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a
|
||||
better way") and its broken predecessor, make sure we don't access the
|
||||
MSI-X table without having enabled MSI-X first, using the mask-all flag
|
||||
instead to prevent interrupts from occurring.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -144,6 +144,17 @@ static bool_t memory_decoded(const struc
|
||||
PCI_COMMAND_MEMORY);
|
||||
}
|
||||
|
||||
+static bool_t msix_memory_decoded(const struct pci_dev *dev, unsigned int pos)
|
||||
+{
|
||||
+ u16 control = pci_conf_read16(dev->seg, dev->bus, PCI_SLOT(dev->devfn),
|
||||
+ PCI_FUNC(dev->devfn), msix_control_reg(pos));
|
||||
+
|
||||
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
|
||||
+ return 0;
|
||||
+
|
||||
+ return memory_decoded(dev);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* MSI message composition
|
||||
*/
|
||||
@@ -222,7 +233,8 @@ static bool_t read_msi_msg(struct msi_de
|
||||
{
|
||||
void __iomem *base = entry->mask_base;
|
||||
|
||||
- if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ if ( unlikely(!msix_memory_decoded(entry->dev,
|
||||
+ entry->msi_attrib.pos)) )
|
||||
return 0;
|
||||
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
|
||||
@@ -287,7 +299,8 @@ static int write_msi_msg(struct msi_desc
|
||||
{
|
||||
void __iomem *base = entry->mask_base;
|
||||
|
||||
- if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ if ( unlikely(!msix_memory_decoded(entry->dev,
|
||||
+ entry->msi_attrib.pos)) )
|
||||
return -ENXIO;
|
||||
writel(msg->address_lo,
|
||||
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
|
||||
@@ -381,9 +394,9 @@ static bool_t msi_set_mask_bit(struct ir
|
||||
{
|
||||
struct msi_desc *entry = desc->msi_desc;
|
||||
struct pci_dev *pdev;
|
||||
- u16 seg;
|
||||
+ u16 seg, control;
|
||||
u8 bus, slot, func;
|
||||
- bool_t flag = host || guest;
|
||||
+ bool_t flag = host || guest, maskall;
|
||||
|
||||
ASSERT(spin_is_locked(&desc->lock));
|
||||
BUG_ON(!entry || !entry->dev);
|
||||
@@ -406,36 +419,45 @@ static bool_t msi_set_mask_bit(struct ir
|
||||
}
|
||||
break;
|
||||
case PCI_CAP_ID_MSIX:
|
||||
+ maskall = pdev->msix->host_maskall;
|
||||
+ control = pci_conf_read16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos));
|
||||
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
|
||||
+ {
|
||||
+ pdev->msix->host_maskall = 1;
|
||||
+ pci_conf_write16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos),
|
||||
+ control | (PCI_MSIX_FLAGS_ENABLE |
|
||||
+ PCI_MSIX_FLAGS_MASKALL));
|
||||
+ }
|
||||
if ( likely(memory_decoded(pdev)) )
|
||||
{
|
||||
writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
|
||||
- break;
|
||||
+ if ( likely(control & PCI_MSIX_FLAGS_ENABLE) )
|
||||
+ break;
|
||||
+ flag = 1;
|
||||
}
|
||||
- if ( flag )
|
||||
+ else if ( flag && !(control & PCI_MSIX_FLAGS_MASKALL) )
|
||||
{
|
||||
- u16 control;
|
||||
domid_t domid = pdev->domain->domain_id;
|
||||
|
||||
- pdev->msix->host_maskall = 1;
|
||||
- control = pci_conf_read16(seg, bus, slot, func,
|
||||
- msix_control_reg(entry->msi_attrib.pos));
|
||||
- if ( control & PCI_MSIX_FLAGS_MASKALL )
|
||||
- break;
|
||||
- pci_conf_write16(seg, bus, slot, func,
|
||||
- msix_control_reg(entry->msi_attrib.pos),
|
||||
- control | PCI_MSIX_FLAGS_MASKALL);
|
||||
+ maskall = 1;
|
||||
if ( pdev->msix->warned != domid )
|
||||
{
|
||||
pdev->msix->warned = domid;
|
||||
printk(XENLOG_G_WARNING
|
||||
- "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
|
||||
+ "cannot mask IRQ %d: masking MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
|
||||
desc->irq, domid, pdev->seg, pdev->bus,
|
||||
PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
|
||||
}
|
||||
- break;
|
||||
}
|
||||
- /* fall through */
|
||||
+ pdev->msix->host_maskall = maskall;
|
||||
+ if ( maskall || pdev->msix->guest_maskall )
|
||||
+ control |= PCI_MSIX_FLAGS_MASKALL;
|
||||
+ pci_conf_write16(seg, bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos), control);
|
||||
+ return flag;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
@@ -461,7 +483,8 @@ static int msi_get_mask_bit(const struct
|
||||
entry->msi.mpos) >>
|
||||
entry->msi_attrib.entry_nr) & 1;
|
||||
case PCI_CAP_ID_MSIX:
|
||||
- if ( unlikely(!memory_decoded(entry->dev)) )
|
||||
+ if ( unlikely(!msix_memory_decoded(entry->dev,
|
||||
+ entry->msi_attrib.pos)) )
|
||||
break;
|
||||
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
|
||||
}
|
||||
@@ -564,9 +587,31 @@ static struct msi_desc *alloc_msi_entry(
|
||||
|
||||
int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc)
|
||||
{
|
||||
- return __setup_msi_irq(desc, msidesc,
|
||||
- msi_maskable_irq(msidesc) ? &pci_msi_maskable
|
||||
- : &pci_msi_nonmaskable);
|
||||
+ const struct pci_dev *pdev = msidesc->dev;
|
||||
+ unsigned int cpos = msix_control_reg(msidesc->msi_attrib.pos);
|
||||
+ u16 control = ~0;
|
||||
+ int rc;
|
||||
+
|
||||
+ if ( msidesc->msi_attrib.type == PCI_CAP_ID_MSIX )
|
||||
+ {
|
||||
+ control = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
|
||||
+ PCI_FUNC(pdev->devfn), cpos);
|
||||
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
|
||||
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
|
||||
+ PCI_FUNC(pdev->devfn), cpos,
|
||||
+ control | (PCI_MSIX_FLAGS_ENABLE |
|
||||
+ PCI_MSIX_FLAGS_MASKALL));
|
||||
+ }
|
||||
+
|
||||
+ rc = __setup_msi_irq(desc, msidesc,
|
||||
+ msi_maskable_irq(msidesc) ? &pci_msi_maskable
|
||||
+ : &pci_msi_nonmaskable);
|
||||
+
|
||||
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
|
||||
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
|
||||
+ PCI_FUNC(pdev->devfn), cpos, control);
|
||||
+
|
||||
+ return rc;
|
||||
}
|
||||
|
||||
int __setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc,
|
||||
@@ -803,20 +848,38 @@ static int msix_capability_init(struct p
|
||||
u8 bus = dev->bus;
|
||||
u8 slot = PCI_SLOT(dev->devfn);
|
||||
u8 func = PCI_FUNC(dev->devfn);
|
||||
+ bool_t maskall = msix->host_maskall;
|
||||
|
||||
ASSERT(spin_is_locked(&pcidevs_lock));
|
||||
|
||||
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
|
||||
- msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
|
||||
+ /*
|
||||
+ * Ensure MSI-X interrupts are masked during setup. Some devices require
|
||||
+ * MSI-X to be enabled before we can touch the MSI-X registers. We need
|
||||
+ * to mask all the vectors to prevent interrupts coming in before they're
|
||||
+ * fully set up.
|
||||
+ */
|
||||
+ msix->host_maskall = 1;
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control | (PCI_MSIX_FLAGS_ENABLE |
|
||||
+ PCI_MSIX_FLAGS_MASKALL));
|
||||
|
||||
if ( unlikely(!memory_decoded(dev)) )
|
||||
+ {
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_ENABLE);
|
||||
return -ENXIO;
|
||||
+ }
|
||||
|
||||
if ( desc )
|
||||
{
|
||||
entry = alloc_msi_entry(1);
|
||||
if ( !entry )
|
||||
+ {
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_ENABLE);
|
||||
return -ENOMEM;
|
||||
+ }
|
||||
ASSERT(msi);
|
||||
}
|
||||
|
||||
@@ -847,6 +910,8 @@ static int msix_capability_init(struct p
|
||||
{
|
||||
if ( !msi || !msi->table_base )
|
||||
{
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_ENABLE);
|
||||
xfree(entry);
|
||||
return -ENXIO;
|
||||
}
|
||||
@@ -889,6 +954,8 @@ static int msix_capability_init(struct p
|
||||
|
||||
if ( idx < 0 )
|
||||
{
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_ENABLE);
|
||||
xfree(entry);
|
||||
return idx;
|
||||
}
|
||||
@@ -915,7 +982,7 @@ static int msix_capability_init(struct p
|
||||
|
||||
if ( !msix->used_entries )
|
||||
{
|
||||
- msix->host_maskall = 0;
|
||||
+ maskall = 0;
|
||||
if ( !msix->guest_maskall )
|
||||
control &= ~PCI_MSIX_FLAGS_MASKALL;
|
||||
else
|
||||
@@ -951,8 +1018,8 @@ static int msix_capability_init(struct p
|
||||
++msix->used_entries;
|
||||
|
||||
/* Restore MSI-X enabled bits */
|
||||
- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
- control & ~PCI_MSIX_FLAGS_MASKALL);
|
||||
+ msix->host_maskall = maskall;
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1092,8 +1159,15 @@ static void __pci_disable_msix(struct ms
|
||||
PCI_CAP_ID_MSIX);
|
||||
u16 control = pci_conf_read16(seg, bus, slot, func,
|
||||
msix_control_reg(entry->msi_attrib.pos));
|
||||
+ bool_t maskall = dev->msix->host_maskall;
|
||||
|
||||
- msix_set_enable(dev, 0);
|
||||
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
|
||||
+ {
|
||||
+ dev->msix->host_maskall = 1;
|
||||
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
|
||||
+ control | (PCI_MSIX_FLAGS_ENABLE |
|
||||
+ PCI_MSIX_FLAGS_MASKALL));
|
||||
+ }
|
||||
|
||||
BUG_ON(list_empty(&dev->msi_list));
|
||||
|
||||
@@ -1105,8 +1179,11 @@ static void __pci_disable_msix(struct ms
|
||||
"cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
|
||||
entry->irq, dev->seg, dev->bus,
|
||||
PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
|
||||
- control |= PCI_MSIX_FLAGS_MASKALL;
|
||||
+ maskall = 1;
|
||||
}
|
||||
+ dev->msix->host_maskall = maskall;
|
||||
+ if ( maskall || dev->msix->guest_maskall )
|
||||
+ control |= PCI_MSIX_FLAGS_MASKALL;
|
||||
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
|
||||
|
||||
_pci_cleanup_msix(dev->msix);
|
||||
@@ -1255,6 +1332,8 @@ int pci_restore_msi_state(struct pci_dev
|
||||
list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
|
||||
{
|
||||
unsigned int i = 0, nr = 1;
|
||||
+ u16 control = 0;
|
||||
+ u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
|
||||
|
||||
irq = entry->irq;
|
||||
desc = &irq_desc[irq];
|
||||
@@ -1281,10 +1360,18 @@ int pci_restore_msi_state(struct pci_dev
|
||||
}
|
||||
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
|
||||
{
|
||||
- msix_set_enable(pdev, 0);
|
||||
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos));
|
||||
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos),
|
||||
+ control | (PCI_MSIX_FLAGS_ENABLE |
|
||||
+ PCI_MSIX_FLAGS_MASKALL));
|
||||
if ( unlikely(!memory_decoded(pdev)) )
|
||||
{
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos),
|
||||
+ control & ~PCI_MSIX_FLAGS_ENABLE);
|
||||
return -ENXIO;
|
||||
}
|
||||
}
|
||||
@@ -1314,11 +1401,9 @@ int pci_restore_msi_state(struct pci_dev
|
||||
if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
|
||||
{
|
||||
unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
|
||||
- u16 control = pci_conf_read16(pdev->seg, pdev->bus,
|
||||
- PCI_SLOT(pdev->devfn),
|
||||
- PCI_FUNC(pdev->devfn), cpos);
|
||||
|
||||
- control &= ~PCI_MSI_FLAGS_QSIZE;
|
||||
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func, cpos) &
|
||||
+ ~PCI_MSI_FLAGS_QSIZE;
|
||||
multi_msi_enable(control, entry->msi.nvec);
|
||||
pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
|
||||
PCI_FUNC(pdev->devfn), cpos, control);
|
||||
@@ -1326,7 +1411,9 @@ int pci_restore_msi_state(struct pci_dev
|
||||
msi_set_enable(pdev, 1);
|
||||
}
|
||||
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
|
||||
- msix_set_enable(pdev, 1);
|
||||
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
|
||||
+ msix_control_reg(entry->msi_attrib.pos),
|
||||
+ control | PCI_MSIX_FLAGS_ENABLE);
|
||||
}
|
||||
|
||||
return 0;
|
@ -1,55 +0,0 @@
|
||||
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
|
||||
|
||||
# Commit aa7c1fdf9dd04a1287f4770906b2c41b88a28228
|
||||
# Date 2015-07-23 10:16:27 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI: properly track guest masking requests
|
||||
|
||||
... by monitoring writes to the mask register.
|
||||
|
||||
This allows reverting the main effect of the XSA-129 patches in qemu.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -1303,6 +1303,37 @@ int pci_msi_conf_write_intercept(struct
|
||||
return 1;
|
||||
}
|
||||
|
||||
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
|
||||
+ if ( entry && entry->msi_attrib.maskbit )
|
||||
+ {
|
||||
+ uint16_t cntl;
|
||||
+ uint32_t unused;
|
||||
+
|
||||
+ pos = entry->msi_attrib.pos;
|
||||
+ if ( reg < pos || reg >= entry->msi.mpos + 8 )
|
||||
+ return 0;
|
||||
+
|
||||
+ if ( reg == msi_control_reg(pos) )
|
||||
+ return size == 2 ? 1 : -EACCES;
|
||||
+ if ( reg < entry->msi.mpos || reg >= entry->msi.mpos + 4 || size != 4 )
|
||||
+ return -EACCES;
|
||||
+
|
||||
+ cntl = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
|
||||
+ unused = ~(uint32_t)0 >> (32 - multi_msi_capable(cntl));
|
||||
+ for ( pos = 0; pos < entry->msi.nvec; ++pos, ++entry )
|
||||
+ {
|
||||
+ entry->msi_attrib.guest_masked =
|
||||
+ *data >> entry->msi_attrib.entry_nr;
|
||||
+ if ( entry->msi_attrib.host_masked )
|
||||
+ *data |= 1 << pos;
|
||||
+ unused &= ~(1 << pos);
|
||||
+ }
|
||||
+
|
||||
+ *data |= unused;
|
||||
+
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,63 +0,0 @@
|
||||
# Commit a7bd9b1661304500cd18b7d216d616ecf053ebdb
|
||||
# Date 2015-08-05 10:32:45 +0100
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Ian Campbell <ian.campbell@citrix.com>
|
||||
x86/gdt: Drop write-only, xalloc()'d array from set_gdt()
|
||||
|
||||
It is not used, and can cause a spurious failure of the set_gdt() hypercall in
|
||||
low memory situations.
|
||||
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
|
||||
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -4383,20 +4383,15 @@ long set_gdt(struct vcpu *v,
|
||||
l1_pgentry_t *pl1e;
|
||||
/* NB. There are 512 8-byte entries per GDT page. */
|
||||
int i, nr_pages = (entries + 511) / 512;
|
||||
- unsigned long mfn, *pfns;
|
||||
|
||||
if ( entries > FIRST_RESERVED_GDT_ENTRY )
|
||||
return -EINVAL;
|
||||
|
||||
- pfns = xmalloc_array(unsigned long, nr_pages);
|
||||
- if ( !pfns )
|
||||
- return -ENOMEM;
|
||||
-
|
||||
/* Check the pages in the new GDT. */
|
||||
for ( i = 0; i < nr_pages; i++ )
|
||||
{
|
||||
struct page_info *page;
|
||||
- pfns[i] = frames[i];
|
||||
+
|
||||
page = get_page_from_gfn(d, frames[i], NULL, P2M_ALLOC);
|
||||
if ( !page )
|
||||
goto fail;
|
||||
@@ -4405,7 +4400,7 @@ long set_gdt(struct vcpu *v,
|
||||
put_page(page);
|
||||
goto fail;
|
||||
}
|
||||
- mfn = frames[i] = page_to_mfn(page);
|
||||
+ frames[i] = page_to_mfn(page);
|
||||
}
|
||||
|
||||
/* Tear down the old GDT. */
|
||||
@@ -4420,7 +4415,6 @@ long set_gdt(struct vcpu *v,
|
||||
l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
|
||||
}
|
||||
|
||||
- xfree(pfns);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
@@ -4428,7 +4422,6 @@ long set_gdt(struct vcpu *v,
|
||||
{
|
||||
put_page_and_type(mfn_to_page(frames[i]));
|
||||
}
|
||||
- xfree(pfns);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -1,169 +0,0 @@
|
||||
# Commit 0174da5b79752e2d5d6ca0faed89536e8f3d91c7
|
||||
# Date 2015-08-06 10:04:43 +0100
|
||||
# Author Anshul Makkar <anshul.makkar@citrix.com>
|
||||
# Committer Ian Campbell <ian.campbell@citrix.com>
|
||||
x86/mm: Make {hap, shadow}_teardown() preemptible
|
||||
|
||||
A domain with sufficient shadow allocation can cause a watchdog timeout
|
||||
during domain destruction. Expand the existing -ERESTART logic in
|
||||
paging_teardown() to allow {hap/sh}_set_allocation() to become
|
||||
restartable during the DOMCTL_destroydomain hypercall.
|
||||
|
||||
Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -503,7 +503,7 @@ void hap_final_teardown(struct domain *d
|
||||
}
|
||||
|
||||
if ( d->arch.paging.hap.total_pages != 0 )
|
||||
- hap_teardown(d);
|
||||
+ hap_teardown(d, NULL);
|
||||
|
||||
p2m_teardown(p2m_get_hostp2m(d));
|
||||
/* Free any memory that the p2m teardown released */
|
||||
@@ -513,7 +513,7 @@ void hap_final_teardown(struct domain *d
|
||||
paging_unlock(d);
|
||||
}
|
||||
|
||||
-void hap_teardown(struct domain *d)
|
||||
+void hap_teardown(struct domain *d, int *preempted)
|
||||
{
|
||||
struct vcpu *v;
|
||||
mfn_t mfn;
|
||||
@@ -541,18 +541,11 @@ void hap_teardown(struct domain *d)
|
||||
|
||||
if ( d->arch.paging.hap.total_pages != 0 )
|
||||
{
|
||||
- HAP_PRINTK("teardown of domain %u starts."
|
||||
- " pages total = %u, free = %u, p2m=%u\n",
|
||||
- d->domain_id,
|
||||
- d->arch.paging.hap.total_pages,
|
||||
- d->arch.paging.hap.free_pages,
|
||||
- d->arch.paging.hap.p2m_pages);
|
||||
- hap_set_allocation(d, 0, NULL);
|
||||
- HAP_PRINTK("teardown done."
|
||||
- " pages total = %u, free = %u, p2m=%u\n",
|
||||
- d->arch.paging.hap.total_pages,
|
||||
- d->arch.paging.hap.free_pages,
|
||||
- d->arch.paging.hap.p2m_pages);
|
||||
+ hap_set_allocation(d, 0, preempted);
|
||||
+
|
||||
+ if ( preempted && *preempted )
|
||||
+ goto out;
|
||||
+
|
||||
ASSERT(d->arch.paging.hap.total_pages == 0);
|
||||
}
|
||||
|
||||
@@ -561,6 +554,7 @@ void hap_teardown(struct domain *d)
|
||||
xfree(d->arch.hvm_domain.dirty_vram);
|
||||
d->arch.hvm_domain.dirty_vram = NULL;
|
||||
|
||||
+out:
|
||||
paging_unlock(d);
|
||||
}
|
||||
|
||||
--- a/xen/arch/x86/mm/paging.c
|
||||
+++ b/xen/arch/x86/mm/paging.c
|
||||
@@ -779,12 +779,15 @@ long paging_domctl_continuation(XEN_GUES
|
||||
/* Call when destroying a domain */
|
||||
int paging_teardown(struct domain *d)
|
||||
{
|
||||
- int rc;
|
||||
+ int rc, preempted = 0;
|
||||
|
||||
if ( hap_enabled(d) )
|
||||
- hap_teardown(d);
|
||||
+ hap_teardown(d, &preempted);
|
||||
else
|
||||
- shadow_teardown(d);
|
||||
+ shadow_teardown(d, &preempted);
|
||||
+
|
||||
+ if ( preempted )
|
||||
+ return -ERESTART;
|
||||
|
||||
/* clean up log dirty resources. */
|
||||
rc = paging_free_log_dirty_bitmap(d, 0);
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -3030,7 +3030,7 @@ int shadow_enable(struct domain *d, u32
|
||||
return rv;
|
||||
}
|
||||
|
||||
-void shadow_teardown(struct domain *d)
|
||||
+void shadow_teardown(struct domain *d, int *preempted)
|
||||
/* Destroy the shadow pagetables of this domain and free its shadow memory.
|
||||
* Should only be called for dying domains. */
|
||||
{
|
||||
@@ -3091,23 +3091,16 @@ void shadow_teardown(struct domain *d)
|
||||
|
||||
if ( d->arch.paging.shadow.total_pages != 0 )
|
||||
{
|
||||
- SHADOW_PRINTK("teardown of domain %u starts."
|
||||
- " Shadow pages total = %u, free = %u, p2m=%u\n",
|
||||
- d->domain_id,
|
||||
- d->arch.paging.shadow.total_pages,
|
||||
- d->arch.paging.shadow.free_pages,
|
||||
- d->arch.paging.shadow.p2m_pages);
|
||||
/* Destroy all the shadows and release memory to domheap */
|
||||
- sh_set_allocation(d, 0, NULL);
|
||||
+ sh_set_allocation(d, 0, preempted);
|
||||
+
|
||||
+ if ( preempted && *preempted )
|
||||
+ goto out;
|
||||
+
|
||||
/* Release the hash table back to xenheap */
|
||||
if (d->arch.paging.shadow.hash_table)
|
||||
shadow_hash_teardown(d);
|
||||
- /* Should not have any more memory held */
|
||||
- SHADOW_PRINTK("teardown done."
|
||||
- " Shadow pages total = %u, free = %u, p2m=%u\n",
|
||||
- d->arch.paging.shadow.total_pages,
|
||||
- d->arch.paging.shadow.free_pages,
|
||||
- d->arch.paging.shadow.p2m_pages);
|
||||
+
|
||||
ASSERT(d->arch.paging.shadow.total_pages == 0);
|
||||
}
|
||||
|
||||
@@ -3138,6 +3131,7 @@ void shadow_teardown(struct domain *d)
|
||||
d->arch.hvm_domain.dirty_vram = NULL;
|
||||
}
|
||||
|
||||
+out:
|
||||
paging_unlock(d);
|
||||
|
||||
/* Must be called outside the lock */
|
||||
@@ -3159,7 +3153,7 @@ void shadow_final_teardown(struct domain
|
||||
* It is possible for a domain that never got domain_kill()ed
|
||||
* to get here with its shadow allocation intact. */
|
||||
if ( d->arch.paging.shadow.total_pages != 0 )
|
||||
- shadow_teardown(d);
|
||||
+ shadow_teardown(d, NULL);
|
||||
|
||||
/* It is now safe to pull down the p2m map. */
|
||||
p2m_teardown(p2m_get_hostp2m(d));
|
||||
--- a/xen/include/asm-x86/hap.h
|
||||
+++ b/xen/include/asm-x86/hap.h
|
||||
@@ -54,7 +54,7 @@ int hap_domctl(struct domain *d, xen_d
|
||||
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
|
||||
int hap_enable(struct domain *d, u32 mode);
|
||||
void hap_final_teardown(struct domain *d);
|
||||
-void hap_teardown(struct domain *d);
|
||||
+void hap_teardown(struct domain *d, int *preempted);
|
||||
void hap_vcpu_init(struct vcpu *v);
|
||||
int hap_track_dirty_vram(struct domain *d,
|
||||
unsigned long begin_pfn,
|
||||
--- a/xen/include/asm-x86/shadow.h
|
||||
+++ b/xen/include/asm-x86/shadow.h
|
||||
@@ -72,7 +72,7 @@ int shadow_domctl(struct domain *d,
|
||||
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
|
||||
|
||||
/* Call when destroying a domain */
|
||||
-void shadow_teardown(struct domain *d);
|
||||
+void shadow_teardown(struct domain *d, int *preempted);
|
||||
|
||||
/* Call once all of the references to the domain have gone away */
|
||||
void shadow_final_teardown(struct domain *d);
|
@ -1,96 +0,0 @@
|
||||
# Commit 22c5675877c8209adcfdb6bceddb561320374529
|
||||
# Date 2015-08-25 16:17:13 +0200
|
||||
# Author Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86, amd_ucode: skip microcode updates for final levels
|
||||
|
||||
Some of older[Fam10h] systems require that certain number of
|
||||
applied microcode patch levels should not be overwritten by
|
||||
the microcode loader. Otherwise, system hangs are known to occur.
|
||||
|
||||
The 'final_levels' of patch ids have been obtained empirically.
|
||||
Refer bug https://bugzilla.suse.com/show_bug.cgi?id=913996
|
||||
for details of the issue.
|
||||
|
||||
The short version is that people have predominantly noticed
|
||||
system hang issues when trying to update microcode levels
|
||||
beyond the patch IDs below.
|
||||
[0x01000098, 0x0100009f, 0x010000af]
|
||||
|
||||
From internal discussions, we gathered that OS/hypervisor
|
||||
cannot reliably perform microcode updates beyond these levels
|
||||
due to hardware issues. Therefore, we need to abort microcode
|
||||
update process if we hit any of these levels.
|
||||
|
||||
In this patch, we check for those microcode versions and abort
|
||||
if the current core has one of those final patch levels applied
|
||||
by the BIOS
|
||||
|
||||
A linux version of the patch has already made it into tip-
|
||||
http://marc.info/?l=linux-kernel&m=143703405627170
|
||||
|
||||
Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
|
||||
|
||||
--- a/xen/arch/x86/microcode_amd.c
|
||||
+++ b/xen/arch/x86/microcode_amd.c
|
||||
@@ -347,6 +347,43 @@ static int container_fast_forward(const
|
||||
return 0;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * The 'final_levels' of patch ids have been obtained empirically.
|
||||
+ * Refer bug https://bugzilla.suse.com/show_bug.cgi?id=913996
|
||||
+ * for details of the issue. The short version is that people
|
||||
+ * using certain Fam10h systems noticed system hang issues when
|
||||
+ * trying to update microcode levels beyond the patch IDs below.
|
||||
+ * From internal discussions, we gathered that OS/hypervisor
|
||||
+ * cannot reliably perform microcode updates beyond these levels
|
||||
+ * due to hardware issues. Therefore, we need to abort microcode
|
||||
+ * update process if we hit any of these levels.
|
||||
+ */
|
||||
+static const unsigned int final_levels[] = {
|
||||
+ 0x01000098,
|
||||
+ 0x0100009f,
|
||||
+ 0x010000af
|
||||
+};
|
||||
+
|
||||
+static bool_t check_final_patch_levels(unsigned int cpu)
|
||||
+{
|
||||
+ /*
|
||||
+ * Check the current patch levels on the cpu. If they are equal to
|
||||
+ * any of the 'final_levels', then we should not update the microcode
|
||||
+ * patch on the cpu as system will hang otherwise.
|
||||
+ */
|
||||
+ struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
|
||||
+ unsigned int i;
|
||||
+
|
||||
+ if ( boot_cpu_data.x86 != 0x10 )
|
||||
+ return 0;
|
||||
+
|
||||
+ for ( i = 0; i < ARRAY_SIZE(final_levels); i++ )
|
||||
+ if ( uci->cpu_sig.rev == final_levels[i] )
|
||||
+ return 1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
|
||||
{
|
||||
struct microcode_amd *mc_amd, *mc_old;
|
||||
@@ -369,6 +406,14 @@ static int cpu_request_microcode(int cpu
|
||||
goto out;
|
||||
}
|
||||
|
||||
+ if ( check_final_patch_levels(cpu) )
|
||||
+ {
|
||||
+ printk(XENLOG_INFO
|
||||
+ "microcode: Cannot update microcode patch on the cpu as we hit a final level\n");
|
||||
+ error = -EPERM;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
mc_amd = xmalloc(struct microcode_amd);
|
||||
if ( !mc_amd )
|
||||
{
|
@ -1,21 +0,0 @@
|
||||
# Commit 5f335544cf5b716b0af51223e33373c4a7d65e8c
|
||||
# Date 2015-08-27 17:40:38 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
IOMMU: skip domains without page tables when dumping
|
||||
|
||||
Reported-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Tested-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
|
||||
--- a/xen/drivers/passthrough/iommu.c
|
||||
+++ b/xen/drivers/passthrough/iommu.c
|
||||
@@ -368,7 +368,7 @@ static void iommu_dump_p2m_table(unsigne
|
||||
ops = iommu_get_ops();
|
||||
for_each_domain(d)
|
||||
{
|
||||
- if ( is_hardware_domain(d) )
|
||||
+ if ( is_hardware_domain(d) || need_iommu(d) <= 0 )
|
||||
continue;
|
||||
|
||||
if ( iommu_use_hap_pt(d) )
|
@ -1,95 +0,0 @@
|
||||
# Commit 8f945d36d9bddd5b589ba23c7322b30d623dd084
|
||||
# Date 2015-08-31 13:51:52 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/NUMA: fix setup_node()
|
||||
|
||||
The function referenced an __initdata object (nodes_found). Since this
|
||||
being a node mask was more complicated than needed, the variable gets
|
||||
replaced by a simple counter. Check at once that the count of nodes
|
||||
doesn't go beyond MAX_NUMNODES.
|
||||
|
||||
Also consolidate three printk()s related to the function's use into just
|
||||
one.
|
||||
|
||||
Finally (quite the opposite of the above issue) __init-annotate
|
||||
nodes_cover_memory().
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/srat.c
|
||||
+++ b/xen/arch/x86/srat.c
|
||||
@@ -25,7 +25,6 @@ static struct acpi_table_slit *__read_mo
|
||||
|
||||
static nodemask_t memory_nodes_parsed __initdata;
|
||||
static nodemask_t processor_nodes_parsed __initdata;
|
||||
-static nodemask_t nodes_found __initdata;
|
||||
static struct node nodes[MAX_NUMNODES] __initdata;
|
||||
static u8 __read_mostly pxm2node[256] = { [0 ... 255] = NUMA_NO_NODE };
|
||||
|
||||
@@ -45,17 +44,25 @@ int pxm_to_node(int pxm)
|
||||
return (signed char)pxm2node[pxm];
|
||||
}
|
||||
|
||||
-__devinit int setup_node(int pxm)
|
||||
+int setup_node(int pxm)
|
||||
{
|
||||
unsigned node = pxm2node[pxm];
|
||||
- if (node == 0xff) {
|
||||
- if (nodes_weight(nodes_found) >= MAX_NUMNODES)
|
||||
+
|
||||
+ if (node == NUMA_NO_NODE) {
|
||||
+ static bool_t warned;
|
||||
+ static unsigned nodes_found;
|
||||
+
|
||||
+ node = nodes_found++;
|
||||
+ if (node >= MAX_NUMNODES) {
|
||||
+ printk(KERN_WARNING
|
||||
+ "SRAT: Too many proximity domains (%#x)\n",
|
||||
+ pxm);
|
||||
+ warned = 1;
|
||||
return -1;
|
||||
- node = first_unset_node(nodes_found);
|
||||
- node_set(node, nodes_found);
|
||||
+ }
|
||||
pxm2node[pxm] = node;
|
||||
}
|
||||
- return pxm2node[pxm];
|
||||
+ return node;
|
||||
}
|
||||
|
||||
int valid_numa_range(u64 start, u64 end, int node)
|
||||
@@ -176,7 +183,6 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
pxm = pa->proximity_domain;
|
||||
node = setup_node(pxm);
|
||||
if (node < 0) {
|
||||
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
|
||||
bad_srat();
|
||||
return;
|
||||
}
|
||||
@@ -209,7 +215,6 @@ acpi_numa_processor_affinity_init(struct
|
||||
}
|
||||
node = setup_node(pxm);
|
||||
if (node < 0) {
|
||||
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
|
||||
bad_srat();
|
||||
return;
|
||||
}
|
||||
@@ -253,7 +258,6 @@ acpi_numa_memory_affinity_init(struct ac
|
||||
pxm &= 0xff;
|
||||
node = setup_node(pxm);
|
||||
if (node < 0) {
|
||||
- printk(KERN_ERR "SRAT: Too many proximity domains.\n");
|
||||
bad_srat();
|
||||
return;
|
||||
}
|
||||
@@ -295,7 +299,7 @@ acpi_numa_memory_affinity_init(struct ac
|
||||
|
||||
/* Sanity check to catch more bad SRATs (they are amazingly common).
|
||||
Make sure the PXMs cover all memory. */
|
||||
-static int nodes_cover_memory(void)
|
||||
+static int __init nodes_cover_memory(void)
|
||||
{
|
||||
int i;
|
||||
|
@ -1,132 +0,0 @@
|
||||
# Commit c011f470e6e79208f5baa071b4d072b78c88e2ba
|
||||
# Date 2015-08-31 13:52:24 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/NUMA: don't account hotplug regions
|
||||
|
||||
... except in cases where they really matter: node_memblk_range[] now
|
||||
is the only place all regions get stored. nodes[] and NODE_DATA() track
|
||||
present memory only. This improves the reporting when nodes have
|
||||
disjoint "normal" and hotplug regions, with the hotplug region sitting
|
||||
above the highest populated page. In such cases a node's spanned-pages
|
||||
value (visible in both XEN_SYSCTL_numainfo and 'u' debug key output)
|
||||
covered all the way up to top of populated memory, giving quite
|
||||
different a picture from what an otherwise identically configured
|
||||
system without and hotplug regions would report. Note, however, that
|
||||
the actual hotplug case (as well as cases of nodes with multiple
|
||||
disjoint present regions) is still not being handled such that the
|
||||
reported values would represent how much memory a node really has (but
|
||||
that can be considered intentional).
|
||||
|
||||
Reported-by: Jim Fehlig <jfehlig@suse.com>
|
||||
|
||||
This at once makes nodes_cover_memory() no longer consider E820_RAM
|
||||
regions covered by SRAT hotplug regions.
|
||||
|
||||
Also reject self-overlaps with mismatching hotplug flags.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Tested-by: Jim Fehlig <jfehlig@suse.com>
|
||||
|
||||
--- a/xen/arch/x86/srat.c
|
||||
+++ b/xen/arch/x86/srat.c
|
||||
@@ -32,7 +32,7 @@ static u8 __read_mostly pxm2node[256] =
|
||||
static int num_node_memblks;
|
||||
static struct node node_memblk_range[NR_NODE_MEMBLKS];
|
||||
static int memblk_nodeid[NR_NODE_MEMBLKS];
|
||||
-
|
||||
+static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);
|
||||
|
||||
static int node_to_pxm(int n);
|
||||
|
||||
@@ -89,9 +89,9 @@ static __init int conflicting_memblks(u6
|
||||
if (nd->start == nd->end)
|
||||
continue;
|
||||
if (nd->end > start && nd->start < end)
|
||||
- return memblk_nodeid[i];
|
||||
+ return i;
|
||||
if (nd->end == end && nd->start == start)
|
||||
- return memblk_nodeid[i];
|
||||
+ return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
@@ -229,7 +229,6 @@ acpi_numa_processor_affinity_init(struct
|
||||
void __init
|
||||
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
||||
{
|
||||
- struct node *nd;
|
||||
u64 start, end;
|
||||
int node, pxm;
|
||||
int i;
|
||||
@@ -263,30 +262,40 @@ acpi_numa_memory_affinity_init(struct ac
|
||||
}
|
||||
/* It is fine to add this area to the nodes data it will be used later*/
|
||||
i = conflicting_memblks(start, end);
|
||||
- if (i == node) {
|
||||
- printk(KERN_WARNING
|
||||
- "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
|
||||
- PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
|
||||
- } else if (i >= 0) {
|
||||
+ if (i < 0)
|
||||
+ /* everything fine */;
|
||||
+ else if (memblk_nodeid[i] == node) {
|
||||
+ bool_t mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
|
||||
+ !test_bit(i, memblk_hotplug);
|
||||
+
|
||||
+ printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
|
||||
+ mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
|
||||
+ node_memblk_range[i].start, node_memblk_range[i].end);
|
||||
+ if (mismatch) {
|
||||
+ bad_srat();
|
||||
+ return;
|
||||
+ }
|
||||
+ } else {
|
||||
printk(KERN_ERR
|
||||
- "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
|
||||
- PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
|
||||
- nodes[i].start, nodes[i].end);
|
||||
+ "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
|
||||
+ pxm, start, end, node_to_pxm(memblk_nodeid[i]),
|
||||
+ node_memblk_range[i].start, node_memblk_range[i].end);
|
||||
bad_srat();
|
||||
return;
|
||||
}
|
||||
- nd = &nodes[node];
|
||||
- if (!node_test_and_set(node, memory_nodes_parsed)) {
|
||||
- nd->start = start;
|
||||
- nd->end = end;
|
||||
- } else {
|
||||
- if (start < nd->start)
|
||||
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
|
||||
+ struct node *nd = &nodes[node];
|
||||
+
|
||||
+ if (!node_test_and_set(node, memory_nodes_parsed)) {
|
||||
nd->start = start;
|
||||
- if (nd->end < end)
|
||||
nd->end = end;
|
||||
+ } else {
|
||||
+ if (start < nd->start)
|
||||
+ nd->start = start;
|
||||
+ if (nd->end < end)
|
||||
+ nd->end = end;
|
||||
+ }
|
||||
}
|
||||
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug)
|
||||
- mem_hotplug = end;
|
||||
printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
|
||||
node, pxm, start, end,
|
||||
ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");
|
||||
@@ -294,6 +303,11 @@ acpi_numa_memory_affinity_init(struct ac
|
||||
node_memblk_range[num_node_memblks].start = start;
|
||||
node_memblk_range[num_node_memblks].end = end;
|
||||
memblk_nodeid[num_node_memblks] = node;
|
||||
+ if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
|
||||
+ __set_bit(num_node_memblks, memblk_hotplug);
|
||||
+ if (end > mem_hotplug)
|
||||
+ mem_hotplug = end;
|
||||
+ }
|
||||
num_node_memblks++;
|
||||
}
|
||||
|
@ -1,176 +0,0 @@
|
||||
# Commit 88e3ed61642bb393458acc7a9bd2f96edc337190
|
||||
# Date 2015-09-01 14:02:57 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/NUMA: make init_node_heap() respect Xen heap limit
|
||||
|
||||
On NUMA systems, where we try to use node local memory for the basic
|
||||
control structures of the buddy allocator, this special case needs to
|
||||
take into consideration a possible address width limit placed on the
|
||||
Xen heap. In turn this (but also other, more abstract considerations)
|
||||
requires that xenheap_max_mfn() not be called more than once (at most
|
||||
we might permit it to be called a second time with a larger value than
|
||||
was passed the first time), and be called only before calling
|
||||
end_boot_allocator().
|
||||
|
||||
While inspecting all the involved code, a couple of off-by-one issues
|
||||
were found (and are being corrected here at once):
|
||||
- arch_init_memory() cleared one too many page table slots
|
||||
- the highmem_start based invocation of xenheap_max_mfn() passed too
|
||||
big a value
|
||||
- xenheap_max_mfn() calculated the wrong bit count in edge cases
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
# Commit 0a7167d9b20cdc48e6ea320fbbb920b3267c9757
|
||||
# Date 2015-09-04 14:58:07 +0100
|
||||
# Author Julien Grall <julien.grall@citrix.com>
|
||||
# Committer Ian Campbell <ian.campbell@citrix.com>
|
||||
xen/arm64: do not (incorrectly) limit size of xenheap
|
||||
|
||||
The commit 88e3ed61642bb393458acc7a9bd2f96edc337190 "x86/NUMA: make
|
||||
init_node_heap() respect Xen heap limit" breaks boot on the arm64 board
|
||||
X-Gene.
|
||||
|
||||
The xenheap bits variable is used to know the last RAM MFN always mapped
|
||||
in Xen virtual memory. If the value is 0, it means that all the memory is
|
||||
always mapped in Xen virtual memory.
|
||||
|
||||
On X-gene the RAM bank resides above 128GB and last xenheap MFN is
|
||||
0x4400000. With the new way to calculate the number of bits, xenheap_bits
|
||||
will be equal to 38 bits. This will result to hide all the RAM and the
|
||||
impossibility to allocate xenheap memory.
|
||||
|
||||
Given that aarch64 have always all the memory mapped in Xen virtual
|
||||
memory, it's not necessary to call xenheap_max_mfn which set the number
|
||||
of bits.
|
||||
|
||||
Suggested-by: Jan Beulich <jbeulich@suse.com>
|
||||
Signed-off-by: Julien Grall <julien.grall@citrix.com>
|
||||
Acked-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
--- a/xen/arch/arm/setup.c
|
||||
+++ b/xen/arch/arm/setup.c
|
||||
@@ -664,7 +664,6 @@ static void __init setup_mm(unsigned lon
|
||||
xenheap_virt_end = XENHEAP_VIRT_START + ram_end - ram_start;
|
||||
xenheap_mfn_start = ram_start >> PAGE_SHIFT;
|
||||
xenheap_mfn_end = ram_end >> PAGE_SHIFT;
|
||||
- xenheap_max_mfn(xenheap_mfn_end);
|
||||
|
||||
/*
|
||||
* Need enough mapped pages for copying the DTB.
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -372,7 +372,7 @@ void __init arch_init_memory(void)
|
||||
|
||||
for ( i = 0; i < l3_table_offset(split_va); ++i )
|
||||
l3tab[i] = l3idle[i];
|
||||
- for ( ; i <= L3_PAGETABLE_ENTRIES; ++i )
|
||||
+ for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
|
||||
l3tab[i] = l3e_empty();
|
||||
split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
|
||||
__PAGE_HYPERVISOR);
|
||||
--- a/xen/arch/x86/setup.c
|
||||
+++ b/xen/arch/x86/setup.c
|
||||
@@ -970,7 +970,7 @@ void __init noreturn __start_xen(unsigne
|
||||
|
||||
setup_max_pdx(raw_max_page);
|
||||
if ( highmem_start )
|
||||
- xenheap_max_mfn(PFN_DOWN(highmem_start));
|
||||
+ xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
|
||||
|
||||
/*
|
||||
* Walk every RAM region and map it in its entirety (on x86/64, at least)
|
||||
@@ -1151,9 +1151,6 @@ void __init noreturn __start_xen(unsigne
|
||||
|
||||
numa_initmem_init(0, raw_max_page);
|
||||
|
||||
- end_boot_allocator();
|
||||
- system_state = SYS_STATE_boot;
|
||||
-
|
||||
if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
|
||||
{
|
||||
unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
|
||||
@@ -1162,6 +1159,8 @@ void __init noreturn __start_xen(unsigne
|
||||
if ( !highmem_start )
|
||||
xenheap_max_mfn(limit);
|
||||
|
||||
+ end_boot_allocator();
|
||||
+
|
||||
/* Pass the remaining memory to the allocator. */
|
||||
for ( i = 0; i < boot_e820.nr_map; i++ )
|
||||
{
|
||||
@@ -1185,6 +1184,10 @@ void __init noreturn __start_xen(unsigne
|
||||
opt_tmem = 0;
|
||||
}
|
||||
}
|
||||
+ else
|
||||
+ end_boot_allocator();
|
||||
+
|
||||
+ system_state = SYS_STATE_boot;
|
||||
|
||||
vm_init();
|
||||
console_init_ring();
|
||||
--- a/xen/common/page_alloc.c
|
||||
+++ b/xen/common/page_alloc.c
|
||||
@@ -405,13 +405,19 @@ void get_outstanding_claims(uint64_t *fr
|
||||
spin_unlock(&heap_lock);
|
||||
}
|
||||
|
||||
+static bool_t __read_mostly first_node_initialised;
|
||||
+#ifndef CONFIG_SEPARATE_XENHEAP
|
||||
+static unsigned int __read_mostly xenheap_bits;
|
||||
+#else
|
||||
+#define xenheap_bits 0
|
||||
+#endif
|
||||
+
|
||||
static unsigned long init_node_heap(int node, unsigned long mfn,
|
||||
unsigned long nr, bool_t *use_tail)
|
||||
{
|
||||
/* First node to be discovered has its heap metadata statically alloced. */
|
||||
static heap_by_zone_and_order_t _heap_static;
|
||||
static unsigned long avail_static[NR_ZONES];
|
||||
- static int first_node_initialised;
|
||||
unsigned long needed = (sizeof(**_heap) +
|
||||
sizeof(**avail) * NR_ZONES +
|
||||
PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
@@ -429,14 +435,18 @@ static unsigned long init_node_heap(int
|
||||
}
|
||||
#ifdef DIRECTMAP_VIRT_END
|
||||
else if ( *use_tail && nr >= needed &&
|
||||
- (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
|
||||
+ (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) &&
|
||||
+ (!xenheap_bits ||
|
||||
+ !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
|
||||
{
|
||||
_heap[node] = mfn_to_virt(mfn + nr - needed);
|
||||
avail[node] = mfn_to_virt(mfn + nr - 1) +
|
||||
PAGE_SIZE - sizeof(**avail) * NR_ZONES;
|
||||
}
|
||||
else if ( nr >= needed &&
|
||||
- (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
|
||||
+ (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) &&
|
||||
+ (!xenheap_bits ||
|
||||
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
|
||||
{
|
||||
_heap[node] = mfn_to_virt(mfn);
|
||||
avail[node] = mfn_to_virt(mfn + needed - 1) +
|
||||
@@ -1541,11 +1551,13 @@ void free_xenheap_pages(void *v, unsigne
|
||||
|
||||
#else
|
||||
|
||||
-static unsigned int __read_mostly xenheap_bits;
|
||||
-
|
||||
void __init xenheap_max_mfn(unsigned long mfn)
|
||||
{
|
||||
- xenheap_bits = fls(mfn) + PAGE_SHIFT;
|
||||
+ ASSERT(!first_node_initialised);
|
||||
+ ASSERT(!xenheap_bits);
|
||||
+ BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
|
||||
+ xenheap_bits = min(fls(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
|
||||
+ printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
|
||||
}
|
||||
|
||||
void init_xenheap_pages(paddr_t ps, paddr_t pe)
|
@ -1,68 +0,0 @@
|
||||
# Commit 244582a01dcb49fa30083725964a066937cc94f2
|
||||
# Date 2015-09-11 16:24:56 +0200
|
||||
# Author Kouya Shimura <kouya@jp.fujitsu.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/hvm: fix saved pmtimer and hpet values
|
||||
|
||||
The ACPI PM timer is sometimes broken on live migration.
|
||||
Since vcpu->arch.hvm_vcpu.guest_time is always zero in other than
|
||||
"delay for missed ticks mode". Even in "delay for missed ticks mode",
|
||||
vcpu's guest_time field is not valid (i.e. zero) when
|
||||
the state of vcpu is "blocked". (see pt_save_timer function)
|
||||
|
||||
The original author (Tim Deegan) of pmtimer_save() must have intended
|
||||
that it saves the last scheduled time of the vcpu. Unfortunately it was
|
||||
already implied this bug. FYI, there is no other timer mode than
|
||||
"delay for missed ticks mode" then.
|
||||
|
||||
For consistency with HPET, pmtimer_save() should refer hvm_get_guest_time()
|
||||
to update the counter as well as hpet_save() does.
|
||||
|
||||
Without this patch, the clock of windows server 2012R2 without HPET
|
||||
might leap forward several minutes on live migration.
|
||||
|
||||
Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
|
||||
|
||||
Retain use of ->arch.hvm_vcpu.guest_time when non-zero. Do the inverse
|
||||
adjustment for vHPET.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: Kouya Shimura <kouya@jp.fujitsu.com>
|
||||
|
||||
--- a/xen/arch/x86/hvm/hpet.c
|
||||
+++ b/xen/arch/x86/hvm/hpet.c
|
||||
@@ -506,11 +506,13 @@ const struct hvm_mmio_handler hpet_mmio_
|
||||
static int hpet_save(struct domain *d, hvm_domain_context_t *h)
|
||||
{
|
||||
HPETState *hp = domain_vhpet(d);
|
||||
+ struct vcpu *v = pt_global_vcpu_target(d);
|
||||
int rc;
|
||||
uint64_t guest_time;
|
||||
|
||||
write_lock(&hp->lock);
|
||||
- guest_time = guest_time_hpet(hp);
|
||||
+ guest_time = (v->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(v)) /
|
||||
+ STIME_PER_HPET_TICK;
|
||||
|
||||
/* Write the proper value into the main counter */
|
||||
if ( hpet_enabled(hp) )
|
||||
--- a/xen/arch/x86/hvm/pmtimer.c
|
||||
+++ b/xen/arch/x86/hvm/pmtimer.c
|
||||
@@ -250,10 +250,12 @@ static int pmtimer_save(struct domain *d
|
||||
|
||||
spin_lock(&s->lock);
|
||||
|
||||
- /* Update the counter to the guest's current time. We always save
|
||||
- * with the domain paused, so the saved time should be after the
|
||||
- * last_gtime, but just in case, make sure we only go forwards */
|
||||
- x = ((s->vcpu->arch.hvm_vcpu.guest_time - s->last_gtime) * s->scale) >> 32;
|
||||
+ /*
|
||||
+ * Update the counter to the guest's current time. Make sure it only
|
||||
+ * goes forwards.
|
||||
+ */
|
||||
+ x = (((s->vcpu->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(s->vcpu)) -
|
||||
+ s->last_gtime) * s->scale) >> 32;
|
||||
if ( x < 1UL<<31 )
|
||||
s->pm.tmr_val += x;
|
||||
if ( (s->pm.tmr_val & TMR_VAL_MSB) != msb )
|
@ -1,23 +0,0 @@
|
||||
# Commit c7d5d5d8ea1ecbd6ef8b47dace4dec825f0f6e48
|
||||
# Date 2015-09-16 11:20:27 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/MSI: fail if no hardware support
|
||||
|
||||
This is to guard against buggy callers (luckily Dom0 only) invoking
|
||||
the respective hypercall for a device not being MSI-capable.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/msi.c
|
||||
+++ b/xen/arch/x86/msi.c
|
||||
@@ -696,6 +696,8 @@ static int msi_capability_init(struct pc
|
||||
|
||||
ASSERT(spin_is_locked(&pcidevs_lock));
|
||||
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
|
||||
+ if ( !pos )
|
||||
+ return -ENODEV;
|
||||
control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
|
||||
maxvec = multi_msi_capable(control);
|
||||
if ( nvec > maxvec )
|
@ -34,9 +34,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Tested-by: David Vrabel <david.vrabel@citrix.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -505,12 +505,12 @@ void update_cr3(struct vcpu *v)
|
||||
Index: xen-4.6.0-testing/xen/arch/x86/mm.c
|
||||
===================================================================
|
||||
--- xen-4.6.0-testing.orig/xen/arch/x86/mm.c
|
||||
+++ xen-4.6.0-testing/xen/arch/x86/mm.c
|
||||
@@ -502,12 +502,12 @@ void update_cr3(struct vcpu *v)
|
||||
make_cr3(v, cr3_mfn);
|
||||
}
|
||||
|
||||
@ -51,7 +53,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
struct page_info *page;
|
||||
|
||||
BUG_ON(unlikely(in_irq()));
|
||||
@@ -525,10 +525,10 @@ static void invalidate_shadow_ldt(struct
|
||||
@@ -522,10 +522,10 @@ static void invalidate_shadow_ldt(struct
|
||||
|
||||
for ( i = 16; i < 32; i++ )
|
||||
{
|
||||
@ -65,7 +67,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
|
||||
ASSERT_PAGE_IS_DOMAIN(page, v->domain);
|
||||
put_page_and_type(page);
|
||||
@@ -4360,16 +4360,18 @@ long do_update_va_mapping_otherdomain(un
|
||||
@@ -4420,16 +4420,18 @@ long do_update_va_mapping_otherdomain(un
|
||||
void destroy_gdt(struct vcpu *v)
|
||||
{
|
||||
l1_pgentry_t *pl1e;
|
||||
@ -88,7 +90,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
v->arch.pv_vcpu.gdt_frames[i] = 0;
|
||||
}
|
||||
}
|
||||
@@ -4382,7 +4384,7 @@ long set_gdt(struct vcpu *v,
|
||||
@@ -4442,7 +4444,7 @@ long set_gdt(struct vcpu *v,
|
||||
struct domain *d = v->domain;
|
||||
l1_pgentry_t *pl1e;
|
||||
/* NB. There are 512 8-byte entries per GDT page. */
|
||||
|
@ -1,77 +0,0 @@
|
||||
# Commit 86f3ff9fc4cc3cb69b96c1de74bcc51f738fe2b9
|
||||
# Date 2015-09-25 09:08:22 +0200
|
||||
# Author Quan Xu <quan.xu@intel.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
vt-d: fix IM bit mask and unmask of Fault Event Control Register
|
||||
|
||||
Bit 0:29 in Fault Event Control Register are 'Reserved and Preserved',
|
||||
software cannot write 0 to it unconditionally. Software must preserve
|
||||
the value read for writes.
|
||||
|
||||
Signed-off-by: Quan Xu <quan.xu@intel.com>
|
||||
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
# Commit 26b300bd727ef00a8f60329212a83c3b027a48f7
|
||||
# Date 2015-09-25 18:03:04 +0200
|
||||
# Author Quan Xu <quan.xu@intel.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
vt-d: fix IM bit unmask of Fault Event Control Register in init_vtd_hw()
|
||||
|
||||
Bit 0:29 in Fault Event Control Register are 'Reserved and Preserved',
|
||||
software cannot write 0 to it unconditionally. Software must preserve
|
||||
the value read for writes.
|
||||
|
||||
Suggested-by: Jan Beulich <jbeulich@suse.com>
|
||||
Signed-off-by: Quan Xu <quan.xu@intel.com>
|
||||
|
||||
--- a/xen/drivers/passthrough/vtd/iommu.c
|
||||
+++ b/xen/drivers/passthrough/vtd/iommu.c
|
||||
@@ -991,10 +991,13 @@ static void dma_msi_unmask(struct irq_de
|
||||
{
|
||||
struct iommu *iommu = desc->action->dev_id;
|
||||
unsigned long flags;
|
||||
+ u32 sts;
|
||||
|
||||
/* unmask it */
|
||||
spin_lock_irqsave(&iommu->register_lock, flags);
|
||||
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
|
||||
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
|
||||
+ sts &= ~DMA_FECTL_IM;
|
||||
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
|
||||
spin_unlock_irqrestore(&iommu->register_lock, flags);
|
||||
iommu->msi.msi_attrib.host_masked = 0;
|
||||
}
|
||||
@@ -1003,10 +1006,13 @@ static void dma_msi_mask(struct irq_desc
|
||||
{
|
||||
unsigned long flags;
|
||||
struct iommu *iommu = desc->action->dev_id;
|
||||
+ u32 sts;
|
||||
|
||||
/* mask it */
|
||||
spin_lock_irqsave(&iommu->register_lock, flags);
|
||||
- dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
|
||||
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
|
||||
+ sts |= DMA_FECTL_IM;
|
||||
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
|
||||
spin_unlock_irqrestore(&iommu->register_lock, flags);
|
||||
iommu->msi.msi_attrib.host_masked = 1;
|
||||
}
|
||||
@@ -2002,6 +2008,7 @@ static int init_vtd_hw(void)
|
||||
struct iommu_flush *flush = NULL;
|
||||
int ret;
|
||||
unsigned long flags;
|
||||
+ u32 sts;
|
||||
|
||||
/*
|
||||
* Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
|
||||
@@ -2015,7 +2022,9 @@ static int init_vtd_hw(void)
|
||||
clear_fault_bits(iommu);
|
||||
|
||||
spin_lock_irqsave(&iommu->register_lock, flags);
|
||||
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
|
||||
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
|
||||
+ sts &= ~DMA_FECTL_IM;
|
||||
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
|
||||
spin_unlock_irqrestore(&iommu->register_lock, flags);
|
||||
}
|
||||
|
@ -1,48 +0,0 @@
|
||||
# Commit 6c0e4ad60850032c9bbd5d18b8446421c97e08e4
|
||||
# Date 2015-09-29 10:25:29 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/EPT: tighten conditions of IOMMU mapping updates
|
||||
|
||||
Permission changes should also result in updates or TLB flushes.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Kevin Tian <kevin.tian@intel.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-ept.c
|
||||
+++ b/xen/arch/x86/mm/p2m-ept.c
|
||||
@@ -619,6 +619,7 @@ ept_set_entry(struct p2m_domain *p2m, un
|
||||
uint8_t ipat = 0;
|
||||
int need_modify_vtd_table = 1;
|
||||
int vtd_pte_present = 0;
|
||||
+ unsigned int iommu_flags = p2m_get_iommu_flags(p2mt);
|
||||
enum { sync_off, sync_on, sync_check } needs_sync = sync_check;
|
||||
ept_entry_t old_entry = { .epte = 0 };
|
||||
ept_entry_t new_entry = { .epte = 0 };
|
||||
@@ -749,8 +750,9 @@ ept_set_entry(struct p2m_domain *p2m, un
|
||||
new_entry.mfn = mfn_x(mfn);
|
||||
|
||||
/* Safe to read-then-write because we hold the p2m lock */
|
||||
- if ( ept_entry->mfn == new_entry.mfn )
|
||||
- need_modify_vtd_table = 0;
|
||||
+ if ( ept_entry->mfn == new_entry.mfn &&
|
||||
+ p2m_get_iommu_flags(ept_entry->sa_p2mt) == iommu_flags )
|
||||
+ need_modify_vtd_table = 0;
|
||||
|
||||
ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
|
||||
}
|
||||
@@ -775,11 +777,9 @@ out:
|
||||
iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
|
||||
else
|
||||
{
|
||||
- unsigned int flags = p2m_get_iommu_flags(p2mt);
|
||||
-
|
||||
- if ( flags != 0 )
|
||||
+ if ( iommu_flags )
|
||||
for ( i = 0; i < (1 << order); i++ )
|
||||
- iommu_map_page(d, gfn + i, mfn_x(mfn) + i, flags);
|
||||
+ iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
|
||||
else
|
||||
for ( i = 0; i < (1 << order); i++ )
|
||||
iommu_unmap_page(d, gfn + i);
|
@ -1,97 +0,0 @@
|
||||
# Commit 960265fbd878cdc9841473b755e4ccc9eb1942d2
|
||||
# Date 2015-09-29 13:55:34 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/p2m-pt: delay freeing of intermediate page tables
|
||||
|
||||
Old intermediate page tables must be freed only after IOMMU side
|
||||
updates/flushes have got carried out.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-pt.c
|
||||
+++ b/xen/arch/x86/mm/p2m-pt.c
|
||||
@@ -486,8 +486,9 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
/* XXX -- this might be able to be faster iff current->domain == d */
|
||||
void *table;
|
||||
unsigned long i, gfn_remainder = gfn;
|
||||
- l1_pgentry_t *p2m_entry;
|
||||
- l1_pgentry_t entry_content;
|
||||
+ l1_pgentry_t *p2m_entry, entry_content;
|
||||
+ /* Intermediate table to free if we're replacing it with a superpage. */
|
||||
+ l1_pgentry_t intermediate_entry = l1e_empty();
|
||||
l2_pgentry_t l2e_content;
|
||||
l3_pgentry_t l3e_content;
|
||||
int rc;
|
||||
@@ -535,7 +536,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
*/
|
||||
if ( page_order == PAGE_ORDER_1G )
|
||||
{
|
||||
- l1_pgentry_t old_entry = l1e_empty();
|
||||
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
|
||||
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
|
||||
L3_PAGETABLE_ENTRIES);
|
||||
@@ -545,7 +545,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
{
|
||||
/* We're replacing a non-SP page with a superpage. Make sure to
|
||||
* handle freeing the table properly. */
|
||||
- old_entry = *p2m_entry;
|
||||
+ intermediate_entry = *p2m_entry;
|
||||
}
|
||||
|
||||
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
|
||||
@@ -563,10 +563,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
|
||||
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
|
||||
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
|
||||
-
|
||||
- /* Free old intermediate tables if necessary */
|
||||
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
|
||||
- p2m_free_entry(p2m, &old_entry, page_order);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -607,7 +603,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
}
|
||||
else if ( page_order == PAGE_ORDER_2M )
|
||||
{
|
||||
- l1_pgentry_t old_entry = l1e_empty();
|
||||
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
|
||||
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
|
||||
L2_PAGETABLE_ENTRIES);
|
||||
@@ -619,7 +614,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
{
|
||||
/* We're replacing a non-SP page with a superpage. Make sure to
|
||||
* handle freeing the table properly. */
|
||||
- old_entry = *p2m_entry;
|
||||
+ intermediate_entry = *p2m_entry;
|
||||
}
|
||||
|
||||
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
|
||||
@@ -640,10 +635,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
|
||||
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
|
||||
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
|
||||
-
|
||||
- /* Free old intermediate tables if necessary */
|
||||
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
|
||||
- p2m_free_entry(p2m, &old_entry, page_order);
|
||||
}
|
||||
|
||||
/* Track the highest gfn for which we have ever had a valid mapping */
|
||||
@@ -671,6 +662,14 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
}
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * Free old intermediate tables if necessary. This has to be the
|
||||
+ * last thing we do, after removal from the IOMMU tables, so as to
|
||||
+ * avoid a potential use-after-free.
|
||||
+ */
|
||||
+ if ( l1e_get_flags(intermediate_entry) & _PAGE_PRESENT )
|
||||
+ p2m_free_entry(p2m, &intermediate_entry, page_order);
|
||||
+
|
||||
out:
|
||||
unmap_domain_page(table);
|
||||
return rc;
|
@ -1,22 +0,0 @@
# Commit c0a85795d864dd64c116af661bf676d66ddfd5fc
# Date 2015-09-29 13:56:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/p2m-pt: ignore pt-share flag for shadow mode guests

There is no page table sharing in shadow mode.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -644,7 +644,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,

if ( iommu_enabled && need_iommu(p2m->domain) )
{
- if ( iommu_hap_pt_share )
+ if ( iommu_use_hap_pt(p2m->domain) )
{
if ( old_mfn && (old_mfn != mfn_x(mfn)) )
amd_iommu_flush_pages(p2m->domain, gfn, page_order);
@ -1,104 +0,0 @@
|
||||
# Commit ea5637968a09a81a64fa5fd73ce49b4ea9789e12
|
||||
# Date 2015-09-30 14:44:22 +0200
|
||||
# Author Dario Faggioli <dario.faggioli@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
credit1: fix tickling when it happens from a remote pCPU
|
||||
|
||||
especially if that is also from a different cpupool than the
|
||||
processor of the vCPU that triggered the tickling.
|
||||
|
||||
In fact, it is possible that we get as far as calling vcpu_unblock()-->
|
||||
vcpu_wake()-->csched_vcpu_wake()-->__runq_tickle() for the vCPU 'vc',
|
||||
but all while running on a pCPU that is different from 'vc->processor'.
|
||||
|
||||
For instance, this can happen when an HVM domain runs in a cpupool,
|
||||
with a different scheduler than the default one, and issues IOREQs
|
||||
to Dom0, running in Pool-0 with the default scheduler.
|
||||
In fact, right in this case, the following crash can be observed:
|
||||
|
||||
(XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
|
||||
(XEN) CPU: 7
|
||||
(XEN) RIP: e008:[<ffff82d0801230de>] __runq_tickle+0x18f/0x430
|
||||
(XEN) RFLAGS: 0000000000010086 CONTEXT: hypervisor (d1v0)
|
||||
(XEN) rax: 0000000000000001 rbx: ffff8303184fee00 rcx: 0000000000000000
|
||||
(XEN) ... ... ...
|
||||
(XEN) Xen stack trace from rsp=ffff83031fa57a08:
|
||||
(XEN) ffff82d0801fe664 ffff82d08033c820 0000000100000002 0000000a00000001
|
||||
(XEN) 0000000000006831 0000000000000000 0000000000000000 0000000000000000
|
||||
(XEN) ... ... ...
|
||||
(XEN) Xen call trace:
|
||||
(XEN) [<ffff82d0801230de>] __runq_tickle+0x18f/0x430
|
||||
(XEN) [<ffff82d08012348a>] csched_vcpu_wake+0x10b/0x110
|
||||
(XEN) [<ffff82d08012b421>] vcpu_wake+0x20a/0x3ce
|
||||
(XEN) [<ffff82d08012b91c>] vcpu_unblock+0x4b/0x4e
|
||||
(XEN) [<ffff82d080167bd0>] vcpu_kick+0x17/0x61
|
||||
(XEN) [<ffff82d080167c46>] vcpu_mark_events_pending+0x2c/0x2f
|
||||
(XEN) [<ffff82d08010ac35>] evtchn_fifo_set_pending+0x381/0x3f6
|
||||
(XEN) [<ffff82d08010a0f6>] notify_via_xen_event_channel+0xc9/0xd6
|
||||
(XEN) [<ffff82d0801c29ed>] hvm_send_ioreq+0x3e9/0x441
|
||||
(XEN) [<ffff82d0801bba7d>] hvmemul_do_io+0x23f/0x2d2
|
||||
(XEN) [<ffff82d0801bbb43>] hvmemul_do_io_buffer+0x33/0x64
|
||||
(XEN) [<ffff82d0801bc92b>] hvmemul_do_pio_buffer+0x35/0x37
|
||||
(XEN) [<ffff82d0801cc49f>] handle_pio+0x58/0x14c
|
||||
(XEN) [<ffff82d0801eabcb>] vmx_vmexit_handler+0x16b3/0x1bea
|
||||
(XEN) [<ffff82d0801efd21>] vmx_asm_vmexit_handler+0x41/0xc0
|
||||
|
||||
In this case, pCPU 7 is not in Pool-0, while the (Dom0's) vCPU being
|
||||
woken is. pCPU's 7 pool has a different scheduler than credit, but it
|
||||
is, however, right from pCPU 7 that we are waking the Dom0's vCPUs.
|
||||
Therefore, the current code tries to access csched_balance_mask for
|
||||
pCPU 7, but that is not defined, and hence the Oops.
|
||||
|
||||
(Note that, in case the two pools run the same scheduler we see no
|
||||
Oops, but things are still conceptually wrong.)
|
||||
|
||||
Cure things by making the csched_balance_mask macro accept a
|
||||
parameter for fetching a specific pCPU's mask (instead than always
|
||||
using smp_processor_id()).
|
||||
|
||||
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
|
||||
Reviewed-by: Juergen Gross <jgross@suse.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/common/sched_credit.c
|
||||
+++ b/xen/common/sched_credit.c
|
||||
@@ -154,10 +154,10 @@ struct csched_pcpu {
|
||||
* Convenience macro for accessing the per-PCPU cpumask we need for
|
||||
* implementing the two steps (soft and hard affinity) balancing logic.
|
||||
* It is stored in csched_pcpu so that serialization is not an issue,
|
||||
- * as there is a csched_pcpu for each PCPU and we always hold the
|
||||
- * runqueue spin-lock when using this.
|
||||
+ * as there is a csched_pcpu for each PCPU, and we always hold the
|
||||
+ * runqueue lock for the proper PCPU when using this.
|
||||
*/
|
||||
-#define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask)
|
||||
+#define csched_balance_mask(c) (CSCHED_PCPU(c)->balance_mask)
|
||||
|
||||
/*
|
||||
* Virtual CPU
|
||||
@@ -396,9 +396,10 @@ __runq_tickle(unsigned int cpu, struct c
|
||||
|
||||
/* Are there idlers suitable for new (for this balance step)? */
|
||||
csched_balance_cpumask(new->vcpu, balance_step,
|
||||
- csched_balance_mask);
|
||||
- cpumask_and(csched_balance_mask, csched_balance_mask, &idle_mask);
|
||||
- new_idlers_empty = cpumask_empty(csched_balance_mask);
|
||||
+ csched_balance_mask(cpu));
|
||||
+ cpumask_and(csched_balance_mask(cpu),
|
||||
+ csched_balance_mask(cpu), &idle_mask);
|
||||
+ new_idlers_empty = cpumask_empty(csched_balance_mask(cpu));
|
||||
|
||||
/*
|
||||
* Let's not be too harsh! If there aren't idlers suitable
|
||||
@@ -1475,8 +1476,9 @@ csched_runq_steal(int peer_cpu, int cpu,
|
||||
&& !__vcpu_has_soft_affinity(vc, vc->cpu_hard_affinity) )
|
||||
continue;
|
||||
|
||||
- csched_balance_cpumask(vc, balance_step, csched_balance_mask);
|
||||
- if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) )
|
||||
+ csched_balance_cpumask(vc, balance_step, csched_balance_mask(cpu));
|
||||
+ if ( __csched_vcpu_is_migrateable(vc, cpu,
|
||||
+ csched_balance_mask(cpu)) )
|
||||
{
|
||||
/* We got a candidate. Grab it! */
|
||||
TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu,
|
@ -1,159 +0,0 @@
|
||||
# Commit 660fd65d5578a95ec5eac522128bba23325179eb
|
||||
# Date 2015-10-02 13:40:36 +0200
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/p2m-pt: tighten conditions of IOMMU mapping updates
|
||||
|
||||
Whether the MFN changes does not depend on the new entry being valid
|
||||
(but solely on the old one), and the need to update or TLB-flush also
|
||||
depends on permission changes.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-pt.c
|
||||
+++ b/xen/arch/x86/mm/p2m-pt.c
|
||||
@@ -493,7 +493,18 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
l3_pgentry_t l3e_content;
|
||||
int rc;
|
||||
unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt);
|
||||
- unsigned long old_mfn = 0;
|
||||
+ /*
|
||||
+ * old_mfn and iommu_old_flags control possible flush/update needs on the
|
||||
+ * IOMMU: We need to flush when MFN or flags (i.e. permissions) change.
|
||||
+ * iommu_old_flags being initialized to zero covers the case of the entry
|
||||
+ * getting replaced being a non-present (leaf or intermediate) one. For
|
||||
+ * present leaf entries the real value will get calculated below, while
|
||||
+ * for present intermediate entries ~0 (guaranteed != iommu_pte_flags)
|
||||
+ * will be used (to cover all cases of what the leaf entries underneath
|
||||
+ * the intermediate one might be).
|
||||
+ */
|
||||
+ unsigned int flags, iommu_old_flags = 0;
|
||||
+ unsigned long old_mfn = INVALID_MFN;
|
||||
|
||||
if ( tb_init_done )
|
||||
{
|
||||
@@ -540,12 +551,20 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
|
||||
L3_PAGETABLE_ENTRIES);
|
||||
ASSERT(p2m_entry);
|
||||
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
|
||||
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
|
||||
+ flags = l1e_get_flags(*p2m_entry);
|
||||
+ if ( flags & _PAGE_PRESENT )
|
||||
{
|
||||
- /* We're replacing a non-SP page with a superpage. Make sure to
|
||||
- * handle freeing the table properly. */
|
||||
- intermediate_entry = *p2m_entry;
|
||||
+ if ( flags & _PAGE_PSE )
|
||||
+ {
|
||||
+ iommu_old_flags =
|
||||
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
|
||||
+ old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ iommu_old_flags = ~0;
|
||||
+ intermediate_entry = *p2m_entry;
|
||||
+ }
|
||||
}
|
||||
|
||||
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
|
||||
@@ -556,10 +575,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
entry_content.l1 = l3e_content.l3;
|
||||
|
||||
if ( entry_content.l1 != 0 )
|
||||
- {
|
||||
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
|
||||
- old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
- }
|
||||
|
||||
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
|
||||
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
|
||||
@@ -584,7 +600,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
|
||||
0, L1_PAGETABLE_ENTRIES);
|
||||
ASSERT(p2m_entry);
|
||||
-
|
||||
+ iommu_old_flags =
|
||||
+ p2m_get_iommu_flags(p2m_flags_to_type(l1e_get_flags(*p2m_entry)));
|
||||
+ old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
+
|
||||
if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct)
|
||||
|| p2m_is_paging(p2mt) )
|
||||
entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
|
||||
@@ -593,10 +612,8 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
entry_content = l1e_empty();
|
||||
|
||||
if ( entry_content.l1 != 0 )
|
||||
- {
|
||||
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
|
||||
- old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
- }
|
||||
+
|
||||
/* level 1 entry */
|
||||
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 1);
|
||||
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
|
||||
@@ -607,14 +624,20 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
|
||||
L2_PAGETABLE_ENTRIES);
|
||||
ASSERT(p2m_entry);
|
||||
-
|
||||
- /* FIXME: Deal with 4k replaced by 2meg pages */
|
||||
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
|
||||
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
|
||||
- {
|
||||
- /* We're replacing a non-SP page with a superpage. Make sure to
|
||||
- * handle freeing the table properly. */
|
||||
- intermediate_entry = *p2m_entry;
|
||||
+ flags = l1e_get_flags(*p2m_entry);
|
||||
+ if ( flags & _PAGE_PRESENT )
|
||||
+ {
|
||||
+ if ( flags & _PAGE_PSE )
|
||||
+ {
|
||||
+ iommu_old_flags =
|
||||
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
|
||||
+ old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ iommu_old_flags = ~0;
|
||||
+ intermediate_entry = *p2m_entry;
|
||||
+ }
|
||||
}
|
||||
|
||||
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
|
||||
@@ -628,10 +651,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
entry_content.l1 = l2e_content.l2;
|
||||
|
||||
if ( entry_content.l1 != 0 )
|
||||
- {
|
||||
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
|
||||
- old_mfn = l1e_get_pfn(*p2m_entry);
|
||||
- }
|
||||
|
||||
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
|
||||
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
|
||||
@@ -642,17 +662,17 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
|
||||
&& (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
|
||||
p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
|
||||
|
||||
- if ( iommu_enabled && need_iommu(p2m->domain) )
|
||||
+ if ( iommu_enabled && need_iommu(p2m->domain) &&
|
||||
+ (iommu_old_flags != iommu_pte_flags || old_mfn != mfn_x(mfn)) )
|
||||
{
|
||||
if ( iommu_use_hap_pt(p2m->domain) )
|
||||
{
|
||||
- if ( old_mfn && (old_mfn != mfn_x(mfn)) )
|
||||
+ if ( iommu_old_flags )
|
||||
amd_iommu_flush_pages(p2m->domain, gfn, page_order);
|
||||
}
|
||||
else
|
||||
{
|
||||
- unsigned int flags = p2m_get_iommu_flags(p2mt);
|
||||
-
|
||||
+ flags = p2m_get_iommu_flags(p2mt);
|
||||
if ( flags != 0 )
|
||||
for ( i = 0; i < (1UL << page_order); i++ )
|
||||
iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i, flags);
|
@ -17,9 +17,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
--- a/xen/arch/x86/apic.c
|
||||
+++ b/xen/arch/x86/apic.c
|
||||
@@ -946,8 +946,18 @@ void __init x2apic_bsp_setup(void)
|
||||
Index: xen-4.6.0-testing/xen/arch/x86/apic.c
|
||||
===================================================================
|
||||
--- xen-4.6.0-testing.orig/xen/arch/x86/apic.c
|
||||
+++ xen-4.6.0-testing/xen/arch/x86/apic.c
|
||||
@@ -943,8 +943,18 @@ void __init x2apic_bsp_setup(void)
|
||||
mask_8259A();
|
||||
mask_IO_APIC_setup(ioapic_entries);
|
||||
|
||||
@ -39,9 +41,11 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
if ( x2apic_enabled )
|
||||
panic("Interrupt remapping could not be enabled while "
|
||||
"x2APIC is already enabled by BIOS");
|
||||
--- a/xen/drivers/passthrough/vtd/intremap.c
|
||||
+++ b/xen/drivers/passthrough/vtd/intremap.c
|
||||
@@ -144,10 +144,10 @@ static void set_hpet_source_id(unsigned
|
||||
Index: xen-4.6.0-testing/xen/drivers/passthrough/vtd/intremap.c
|
||||
===================================================================
|
||||
--- xen-4.6.0-testing.orig/xen/drivers/passthrough/vtd/intremap.c
|
||||
+++ xen-4.6.0-testing/xen/drivers/passthrough/vtd/intremap.c
|
||||
@@ -143,10 +143,10 @@ static void set_hpet_source_id(unsigned
|
||||
set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, hpetid_to_bdf(id));
|
||||
}
|
||||
|
||||
@ -54,7 +58,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
if ( !iommu_qinval || !iommu_intremap || list_empty(&acpi_drhd_units) )
|
||||
return 0;
|
||||
@@ -155,12 +155,12 @@ int iommu_supports_eim(void)
|
||||
@@ -154,12 +154,12 @@ int iommu_supports_eim(void)
|
||||
/* We MUST have a DRHD unit for each IOAPIC. */
|
||||
for ( apic = 0; apic < nr_ioapics; apic++ )
|
||||
if ( !ioapic_to_drhd(IO_APIC_ID(apic)) )
|
||||
@ -69,7 +73,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
for_each_drhd_unit ( drhd )
|
||||
if ( !ecap_queued_inval(drhd->iommu->ecap) ||
|
||||
@@ -834,10 +834,10 @@ int iommu_enable_x2apic_IR(void)
|
||||
@@ -833,10 +833,10 @@ int iommu_enable_x2apic_IR(void)
|
||||
struct iommu *iommu;
|
||||
|
||||
if ( !iommu_supports_eim() )
|
||||
@ -82,7 +86,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
|
||||
for_each_drhd_unit ( drhd )
|
||||
{
|
||||
@@ -862,7 +862,7 @@ int iommu_enable_x2apic_IR(void)
|
||||
@@ -861,7 +861,7 @@ int iommu_enable_x2apic_IR(void)
|
||||
{
|
||||
dprintk(XENLOG_INFO VTDPREFIX,
|
||||
"Failed to enable Queued Invalidation!\n");
|
||||
@ -91,7 +95,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
}
|
||||
}
|
||||
|
||||
@@ -874,7 +874,7 @@ int iommu_enable_x2apic_IR(void)
|
||||
@@ -873,7 +873,7 @@ int iommu_enable_x2apic_IR(void)
|
||||
{
|
||||
dprintk(XENLOG_INFO VTDPREFIX,
|
||||
"Failed to enable Interrupt Remapping!\n");
|
||||
@ -100,9 +104,11 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
|
||||
}
|
||||
}
|
||||
|
||||
--- a/xen/include/asm-x86/iommu.h
|
||||
+++ b/xen/include/asm-x86/iommu.h
|
||||
@@ -28,7 +28,7 @@ int iommu_setup_hpet_msi(struct msi_desc
|
||||
Index: xen-4.6.0-testing/xen/include/asm-x86/iommu.h
|
||||
===================================================================
|
||||
--- xen-4.6.0-testing.orig/xen/include/asm-x86/iommu.h
|
||||
+++ xen-4.6.0-testing/xen/include/asm-x86/iommu.h
|
||||
@@ -27,7 +27,7 @@ int iommu_setup_hpet_msi(struct msi_desc
|
||||
/* While VT-d specific, this must get declared in a generic header. */
|
||||
int adjust_vtd_irq_affinities(void);
|
||||
void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int order, int present);
|
||||
|
@ -12,17 +12,17 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/traps.c
|
||||
+++ b/xen/arch/x86/traps.c
|
||||
@@ -904,6 +904,7 @@ void pv_cpuid(struct cpu_user_regs *regs
|
||||
@@ -967,6 +967,7 @@ void pv_cpuid(struct cpu_user_regs *regs
|
||||
__clear_bit(X86_FEATURE_LWP % 32, &c);
|
||||
__clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
|
||||
__clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
|
||||
+ __clear_bit(X86_FEATURE_MWAITX % 32, &c);
|
||||
break;
|
||||
|
||||
case 0x00000005: /* MONITOR/MWAIT */
|
||||
case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
|
||||
--- a/xen/include/asm-x86/cpufeature.h
|
||||
+++ b/xen/include/asm-x86/cpufeature.h
|
||||
@@ -137,6 +137,7 @@
|
||||
@@ -135,6 +135,7 @@
|
||||
#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
|
||||
#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
|
||||
#define X86_FEATURE_DBEXT (6*32+26) /* data breakpoint extension */
|
||||
|
@ -16,7 +16,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/numa.c
|
||||
+++ b/xen/arch/x86/numa.c
|
||||
@@ -347,7 +347,7 @@ void __init init_cpu_to_node(void)
|
||||
@@ -349,7 +349,7 @@ void __init init_cpu_to_node(void)
|
||||
u32 apicid = x86_cpu_to_apicid[i];
|
||||
if ( apicid == BAD_APICID )
|
||||
continue;
|
||||
@ -27,8 +27,8 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
numa_set_node(i, node);
|
||||
--- a/xen/arch/x86/setup.c
|
||||
+++ b/xen/arch/x86/setup.c
|
||||
@@ -191,7 +191,7 @@ void __devinit srat_detect_node(int cpu)
|
||||
unsigned node;
|
||||
@@ -200,7 +200,7 @@ void __devinit srat_detect_node(int cpu)
|
||||
nodeid_t node;
|
||||
u32 apicid = x86_cpu_to_apicid[cpu];
|
||||
|
||||
- node = apicid_to_node[apicid];
|
||||
@ -38,7 +38,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/smpboot.c
|
||||
+++ b/xen/arch/x86/smpboot.c
|
||||
@@ -885,7 +885,8 @@ int cpu_add(uint32_t apic_id, uint32_t a
|
||||
@@ -993,7 +993,8 @@ int cpu_add(uint32_t apic_id, uint32_t a
|
||||
cpu = node;
|
||||
goto out;
|
||||
}
|
||||
@ -50,15 +50,15 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
/* Physically added CPUs do not have synchronised TSC. */
|
||||
--- a/xen/arch/x86/srat.c
|
||||
+++ b/xen/arch/x86/srat.c
|
||||
@@ -170,7 +170,6 @@ void __init
|
||||
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
|
||||
@@ -209,7 +209,6 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
{
|
||||
int pxm, node;
|
||||
- int apic_id;
|
||||
unsigned pxm;
|
||||
nodeid_t node;
|
||||
- u32 apic_id;
|
||||
|
||||
if (srat_disabled())
|
||||
return;
|
||||
@@ -178,8 +177,13 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
@@ -217,8 +216,13 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
bad_srat();
|
||||
return;
|
||||
}
|
||||
@ -72,8 +72,8 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
+
|
||||
pxm = pa->proximity_domain;
|
||||
node = setup_node(pxm);
|
||||
if (node < 0) {
|
||||
@@ -187,11 +191,11 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
if (node == NUMA_NO_NODE) {
|
||||
@@ -226,11 +230,11 @@ acpi_numa_x2apic_affinity_init(struct ac
|
||||
return;
|
||||
}
|
||||
|
||||
@ -89,7 +89,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
}
|
||||
|
||||
/* Callback for Proximity Domain -> LAPIC mapping */
|
||||
@@ -221,7 +225,7 @@ acpi_numa_processor_affinity_init(struct
|
||||
@@ -262,7 +266,7 @@ acpi_numa_processor_affinity_init(struct
|
||||
apicid_to_node[pa->apic_id] = node;
|
||||
node_set(node, processor_nodes_parsed);
|
||||
acpi_numa = 1;
|
||||
@ -100,7 +100,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/drivers/acpi/numa.c
|
||||
+++ b/xen/drivers/acpi/numa.c
|
||||
@@ -199,9 +199,9 @@ int __init acpi_numa_init(void)
|
||||
@@ -198,9 +198,9 @@ int __init acpi_numa_init(void)
|
||||
/* SRAT: Static Resource Affinity Table */
|
||||
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
|
||||
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
|
@ -0,0 +1,49 @@
# Commit 29bcf64ce8bc0b1b7aacd00c8668f255c4f0686c
# Date 2015-10-29 13:31:10 +0100
# Author Julien Grall <julien.grall@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: Support hypercall_create_continuation for multicall

Multicall for ARM has been supported since commit f0dbdc6 "xen: arm: fully
implement multicall interface.". Although, if an hypercall in multicall
requires preemption, it will crash the host:

(XEN) Xen BUG at domain.c:347
(XEN) ----[ Xen-4.7-unstable arm64 debug=y Tainted: C ]----
[...]
(XEN) Xen call trace:
(XEN) [<00000000002420cc>] hypercall_create_continuation+0x64/0x380 (PC)
(XEN) [<0000000000217274>] do_memory_op+0x1b00/0x2334 (LR)
(XEN) [<0000000000250d2c>] do_multicall_call+0x114/0x124
(XEN) [<0000000000217ff0>] do_multicall+0x17c/0x23c
(XEN) [<000000000024f97c>] do_trap_hypercall+0x90/0x12c
(XEN) [<0000000000251ca8>] do_trap_hypervisor+0xd2c/0x1ba4
(XEN) [<00000000002582cc>] guest_sync+0x88/0xb8
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 5:
(XEN) Xen BUG at domain.c:347
(XEN) ****************************************
(XEN)
(XEN) Manual reset required ('noreboot' specified)

Looking to the code, the support of multicall looks valid to me, as we only
need to fill call.args[...]. So drop the BUG();

This is CVE-2015-7812 / XSA-145.

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -344,8 +344,6 @@ unsigned long hypercall_create_continuat

if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
{
- BUG(); /* XXX multicalls not implemented yet. */
-
__set_bit(_MCSF_call_preempted, &mcs->flags);

for ( i = 0; *p != '\0'; i++ )
@ -0,0 +1,42 @@
# Commit 1c0e59ff15764e7b0c59282365974f5b8924ce83
# Date 2015-10-29 13:33:38 +0100
# Author Ian Campbell <ian.campbell@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: rate-limit logging from unimplemented PHYSDEVOP and HVMOP.

These are guest accessible and should therefore be rate-limited.
Moreover, include them only in debug builds.

This is CVE-2015-7813 / XSA-146.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/arm/hvm.c
+++ b/xen/arch/arm/hvm.c
@@ -57,7 +57,7 @@ long do_hvm_op(unsigned long op, XEN_GUE

default:
{
- printk("%s: Bad HVM op %ld.\n", __func__, op);
+ gdprintk(XENLOG_DEBUG, "HVMOP op=%lu: not implemented\n", op);
rc = -ENOSYS;
break;
}
--- a/xen/arch/arm/physdev.c
+++ b/xen/arch/arm/physdev.c
@@ -8,12 +8,13 @@
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/errno.h>
+#include <xen/sched.h>
#include <asm/hypercall.h>


int do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
- printk("%s %d cmd=%d: not implemented yet\n", __func__, __LINE__, cmd);
+ gdprintk(XENLOG_DEBUG, "PHYSDEVOP cmd=%d: not implemented\n", cmd);
return -ENOSYS;
}

@ -0,0 +1,40 @@
# Commit 1ef01396fdff88b1c3331a09ca5c69619b90f4ea
# Date 2015-10-29 13:34:17 +0100
# Author Ian Campbell <ian.campbell@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: handle races between relinquish_memory and free_domheap_pages

Primarily this means XENMEM_decrease_reservation from a toolstack
domain.

Unlike x86 we have no requirement right now to queue such pages onto
a separate list, if we hit this race then the other code has already
fully accepted responsibility for freeing this page and therefore
there is no more for relinquish_memory to do.

This is CVE-2015-7814 / XSA-147.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Julien Grall <julien.grall@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -768,8 +768,15 @@ static int relinquish_memory(struct doma
{
/* Grab a reference to the page so it won't disappear from under us. */
if ( unlikely(!get_page(page, d)) )
- /* Couldn't get a reference -- someone is freeing this page. */
- BUG();
+ /*
+ * Couldn't get a reference -- someone is freeing this page and
+ * has already committed to doing so, so no more to do here.
+ *
+ * Note that the page must be left on the list, a list_del
+ * here will clash with the list_del done by the other
+ * party in the race and corrupt the list head.
+ */
+ continue;

if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
put_page(page);
@ -1,5 +1,7 @@
|
||||
References: bsc#950367 CVE-2015-7835 XSA-148
|
||||
|
||||
# Commit fe360c90ea13f309ef78810f1a2b92f2ae3b30b8
|
||||
# Date 2015-10-29 13:35:07 +0100
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86: guard against undue super page PTE creation
|
||||
|
||||
When optional super page support got added (commit bd1cd81d64 "x86: PV
|
||||
@ -10,14 +12,13 @@ unconditionally.
|
||||
|
||||
This is CVE-2015-7835 / XSA-148.
|
||||
|
||||
Reported-by: "栾尚聪(好风)" <shangcong.lsc@alibaba-inc.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
Index: xen-4.5.1-testing/xen/arch/x86/mm.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/xen/arch/x86/mm.c
|
||||
+++ xen-4.5.1-testing/xen/arch/x86/mm.c
|
||||
@@ -162,7 +162,10 @@ static void put_superpage(unsigned long
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -160,7 +160,10 @@ static void put_superpage(unsigned long
|
||||
static uint32_t base_disallow_mask;
|
||||
/* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */
|
||||
#define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
|
||||
@ -27,9 +28,9 @@ Index: xen-4.5.1-testing/xen/arch/x86/mm.c
|
||||
+ ? base_disallow_mask & ~_PAGE_PSE \
|
||||
+ : base_disallow_mask)
|
||||
|
||||
#define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
|
||||
base_disallow_mask : \
|
||||
@@ -1790,7 +1793,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
|
||||
#define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
|
||||
base_disallow_mask : 0xFFFFF198U)
|
||||
@@ -1839,7 +1842,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
|
||||
}
|
||||
|
||||
/* Fast path for identical mapping and presence. */
|
25
5632129c-free-domain-s-vcpu-array.patch
Normal file
@ -0,0 +1,25 @@
# Commit d46896ebbb23f3a9fef2eb6066ae614fd1acfd96
# Date 2015-10-29 13:35:40 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
free domain's vcpu array

This was overlooked in fb442e2171 ("x86_64: allow more vCPU-s per
guest").

This is CVE-2015-7969 / XSA-149.

Reported-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -833,6 +833,7 @@ static void complete_domain_destroy(stru

xsm_free_security_domain(d);
free_cpumask_var(d->domain_dirty_cpumask);
+ xfree(d->vcpu);
free_domain_struct(d);

send_global_virq(VIRQ_DOM_EXC);
205
563212c9-x86-PoD-Eager-sweep-for-zeroed-pages.patch
Normal file
@ -0,0 +1,205 @@
|
||||
# Commit 101ce53266866144e724ed593173bc4098b300b9
|
||||
# Date 2015-10-29 13:36:25 +0100
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/PoD: Eager sweep for zeroed pages
|
||||
|
||||
Based on the contents of a guests physical address space,
|
||||
p2m_pod_emergency_sweep() could degrade into a linear memcmp() from 0 to
|
||||
max_gfn, which runs non-preemptibly.
|
||||
|
||||
As p2m_pod_emergency_sweep() runs behind the scenes in a number of contexts,
|
||||
making it preemptible is not feasible.
|
||||
|
||||
Instead, a different approach is taken. Recently-populated pages are eagerly
|
||||
checked for reclaimation, which amortises the p2m_pod_emergency_sweep()
|
||||
operation across each p2m_pod_demand_populate() operation.
|
||||
|
||||
Note that in the case that a 2M superpage can't be reclaimed as a superpage,
|
||||
it is shattered if 4K pages of zeros can be reclaimed. This is unfortunate
|
||||
but matches the previous behaviour, and is required to avoid regressions
|
||||
(domain crash from PoD exhaustion) with VMs configured close to the limit.
|
||||
|
||||
This is CVE-2015-7970 / XSA-150.
|
||||
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-pod.c
|
||||
+++ b/xen/arch/x86/mm/p2m-pod.c
|
||||
@@ -901,28 +901,6 @@ p2m_pod_zero_check(struct p2m_domain *p2
|
||||
}
|
||||
|
||||
#define POD_SWEEP_LIMIT 1024
|
||||
-
|
||||
-/* When populating a new superpage, look at recently populated superpages
|
||||
- * hoping that they've been zeroed. This will snap up zeroed pages as soon as
|
||||
- * the guest OS is done with them. */
|
||||
-static void
|
||||
-p2m_pod_check_last_super(struct p2m_domain *p2m, unsigned long gfn_aligned)
|
||||
-{
|
||||
- unsigned long check_gfn;
|
||||
-
|
||||
- ASSERT(p2m->pod.last_populated_index < POD_HISTORY_MAX);
|
||||
-
|
||||
- check_gfn = p2m->pod.last_populated[p2m->pod.last_populated_index];
|
||||
-
|
||||
- p2m->pod.last_populated[p2m->pod.last_populated_index] = gfn_aligned;
|
||||
-
|
||||
- p2m->pod.last_populated_index =
|
||||
- ( p2m->pod.last_populated_index + 1 ) % POD_HISTORY_MAX;
|
||||
-
|
||||
- p2m_pod_zero_check_superpage(p2m, check_gfn);
|
||||
-}
|
||||
-
|
||||
-
|
||||
#define POD_SWEEP_STRIDE 16
|
||||
static void
|
||||
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
|
||||
@@ -963,7 +941,7 @@ p2m_pod_emergency_sweep(struct p2m_domai
|
||||
* NB that this is a zero-sum game; we're increasing our cache size
|
||||
* by re-increasing our 'debt'. Since we hold the pod lock,
|
||||
* (entry_count - count) must remain the same. */
|
||||
- if ( p2m->pod.count > 0 && i < limit )
|
||||
+ if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -975,6 +953,58 @@ p2m_pod_emergency_sweep(struct p2m_domai
|
||||
|
||||
}
|
||||
|
||||
+static void pod_eager_reclaim(struct p2m_domain *p2m)
|
||||
+{
|
||||
+ struct pod_mrp_list *mrp = &p2m->pod.mrp;
|
||||
+ unsigned int i = 0;
|
||||
+
|
||||
+ /*
|
||||
+ * Always check one page for reclaimation.
|
||||
+ *
|
||||
+ * If the PoD pool is empty, keep checking some space is found, or all
|
||||
+ * entries have been exhaused.
|
||||
+ */
|
||||
+ do
|
||||
+ {
|
||||
+ unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
|
||||
+ unsigned long gfn = mrp->list[idx];
|
||||
+
|
||||
+ if ( gfn != INVALID_GFN )
|
||||
+ {
|
||||
+ if ( gfn & POD_LAST_SUPERPAGE )
|
||||
+ {
|
||||
+ gfn &= ~POD_LAST_SUPERPAGE;
|
||||
+
|
||||
+ if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
|
||||
+ {
|
||||
+ unsigned int x;
|
||||
+
|
||||
+ for ( x = 0; x < SUPERPAGE_PAGES; ++x, ++gfn )
|
||||
+ p2m_pod_zero_check(p2m, &gfn, 1);
|
||||
+ }
|
||||
+ }
|
||||
+ else
|
||||
+ p2m_pod_zero_check(p2m, &gfn, 1);
|
||||
+
|
||||
+ mrp->list[idx] = INVALID_GFN;
|
||||
+ }
|
||||
+
|
||||
+ } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
|
||||
+}
|
||||
+
|
||||
+static void pod_eager_record(struct p2m_domain *p2m,
|
||||
+ unsigned long gfn, unsigned int order)
|
||||
+{
|
||||
+ struct pod_mrp_list *mrp = &p2m->pod.mrp;
|
||||
+
|
||||
+ ASSERT(mrp->list[mrp->idx] == INVALID_GFN);
|
||||
+ ASSERT(gfn != INVALID_GFN);
|
||||
+
|
||||
+ mrp->list[mrp->idx++] =
|
||||
+ gfn | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
|
||||
+ mrp->idx %= ARRAY_SIZE(mrp->list);
|
||||
+}
|
||||
+
|
||||
int
|
||||
p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
|
||||
unsigned int order,
|
||||
@@ -1015,6 +1045,8 @@ p2m_pod_demand_populate(struct p2m_domai
|
||||
return 0;
|
||||
}
|
||||
|
||||
+ pod_eager_reclaim(p2m);
|
||||
+
|
||||
/* Only sweep if we're actually out of memory. Doing anything else
|
||||
* causes unnecessary time and fragmentation of superpages in the p2m. */
|
||||
if ( p2m->pod.count == 0 )
|
||||
@@ -1051,6 +1083,8 @@ p2m_pod_demand_populate(struct p2m_domai
|
||||
p2m->pod.entry_count -= (1 << order);
|
||||
BUG_ON(p2m->pod.entry_count < 0);
|
||||
|
||||
+ pod_eager_record(p2m, gfn_aligned, order);
|
||||
+
|
||||
if ( tb_init_done )
|
||||
{
|
||||
struct {
|
||||
@@ -1066,12 +1100,6 @@ p2m_pod_demand_populate(struct p2m_domai
|
||||
__trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
|
||||
}
|
||||
|
||||
- /* Check the last guest demand-populate */
|
||||
- if ( p2m->pod.entry_count > p2m->pod.count
|
||||
- && (order == PAGE_ORDER_2M)
|
||||
- && (q & P2M_ALLOC) )
|
||||
- p2m_pod_check_last_super(p2m, gfn_aligned);
|
||||
-
|
||||
pod_unlock(p2m);
|
||||
return 0;
|
||||
out_of_memory:
|
||||
--- a/xen/arch/x86/mm/p2m.c
|
||||
+++ b/xen/arch/x86/mm/p2m.c
|
||||
@@ -60,6 +60,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
|
||||
/* Init the datastructures for later use by the p2m code */
|
||||
static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
|
||||
{
|
||||
+ unsigned int i;
|
||||
int ret = 0;
|
||||
|
||||
mm_rwlock_init(&p2m->lock);
|
||||
@@ -75,6 +76,9 @@ static int p2m_initialise(struct domain
|
||||
|
||||
p2m->np2m_base = P2M_BASE_EADDR;
|
||||
|
||||
+ for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
|
||||
+ p2m->pod.mrp.list[i] = INVALID_GFN;
|
||||
+
|
||||
if ( hap_enabled(d) && cpu_has_vmx )
|
||||
ret = ept_p2m_init(p2m);
|
||||
else
|
||||
--- a/xen/include/asm-x86/p2m.h
|
||||
+++ b/xen/include/asm-x86/p2m.h
|
||||
@@ -292,10 +292,20 @@ struct p2m_domain {
|
||||
entry_count; /* # of pages in p2m marked pod */
|
||||
unsigned long reclaim_single; /* Last gpfn of a scan */
|
||||
unsigned long max_guest; /* gpfn of max guest demand-populate */
|
||||
-#define POD_HISTORY_MAX 128
|
||||
- /* gpfn of last guest superpage demand-populated */
|
||||
- unsigned long last_populated[POD_HISTORY_MAX];
|
||||
- unsigned int last_populated_index;
|
||||
+
|
||||
+ /*
|
||||
+ * Tracking of the most recently populated PoD pages, for eager
|
||||
+ * reclamation.
|
||||
+ */
|
||||
+ struct pod_mrp_list {
|
||||
+#define NR_POD_MRP_ENTRIES 32
|
||||
+
|
||||
+/* Encode ORDER_2M superpage in top bit of GFN */
|
||||
+#define POD_LAST_SUPERPAGE (INVALID_GFN & ~(INVALID_GFN >> 1))
|
||||
+
|
||||
+ unsigned long list[NR_POD_MRP_ENTRIES];
|
||||
+ unsigned int idx;
|
||||
+ } mrp;
|
||||
mm_lock_t lock; /* Locking of private pod structs, *
|
||||
* not relying on the p2m lock. */
|
||||
} pod;
|
@ -1,17 +1,19 @@
|
||||
# Commit 6e97c4b37386c2d09e09e9b5d5d232e37728b960
|
||||
# Date 2015-10-29 13:36:52 +0100
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
xenoprof: free domain's vcpu array
|
||||
|
||||
This was overlooked in fb442e2171 ("x86_64: allow more vCPU-s per
|
||||
guest").
|
||||
|
||||
This is XSA-151.
|
||||
This is CVE-2015-7969 / XSA-151.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
Index: xen-4.5.1-testing/xen/common/xenoprof.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/xen/common/xenoprof.c
|
||||
+++ xen-4.5.1-testing/xen/common/xenoprof.c
|
||||
--- a/xen/common/xenoprof.c
|
||||
+++ b/xen/common/xenoprof.c
|
||||
@@ -239,6 +239,7 @@ static int alloc_xenoprof_struct(
|
||||
d->xenoprof->rawbuf = alloc_xenheap_pages(get_order_from_pages(npages), 0);
|
||||
if ( d->xenoprof->rawbuf == NULL )
|
@ -1,3 +1,7 @@
|
||||
# Commit 95e7415843b94c346e5ba8682665f508f220e04b
|
||||
# Date 2015-10-29 13:37:19 +0100
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86: rate-limit logging in do_xen{oprof,pmu}_op()
|
||||
|
||||
Some of the sub-ops are acessible to all guests, and hence should be
|
||||
@ -5,14 +9,37 @@ rate-limited. In the xenoprof case, just like for XSA-146, include them
|
||||
only in debug builds. Since the vPMU code is rather new, allow them to
|
||||
be always present, but downgrade them to (rate limited) guest messages.
|
||||
|
||||
This is XSA-152.
|
||||
This is CVE-2015-7971 / XSA-152.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
|
||||
|
||||
Index: xen-4.5.1-testing/xen/common/xenoprof.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/xen/common/xenoprof.c
|
||||
+++ xen-4.5.1-testing/xen/common/xenoprof.c
|
||||
--- a/xen/arch/x86/cpu/vpmu.c
|
||||
+++ b/xen/arch/x86/cpu/vpmu.c
|
||||
@@ -682,8 +682,8 @@ long do_xenpmu_op(unsigned int op, XEN_G
|
||||
vpmu_mode = pmu_params.val;
|
||||
else if ( vpmu_mode != pmu_params.val )
|
||||
{
|
||||
- printk(XENLOG_WARNING
|
||||
- "VPMU: Cannot change mode while active VPMUs exist\n");
|
||||
+ gprintk(XENLOG_WARNING,
|
||||
+ "VPMU: Cannot change mode while active VPMUs exist\n");
|
||||
ret = -EBUSY;
|
||||
}
|
||||
|
||||
@@ -714,8 +714,8 @@ long do_xenpmu_op(unsigned int op, XEN_G
|
||||
vpmu_features = pmu_params.val;
|
||||
else
|
||||
{
|
||||
- printk(XENLOG_WARNING "VPMU: Cannot change features while"
|
||||
- " active VPMUs exist\n");
|
||||
+ gprintk(XENLOG_WARNING,
|
||||
+ "VPMU: Cannot change features while active VPMUs exist\n");
|
||||
ret = -EBUSY;
|
||||
}
|
||||
|
||||
--- a/xen/common/xenoprof.c
|
||||
+++ b/xen/common/xenoprof.c
|
||||
@@ -676,15 +676,13 @@ ret_t do_xenoprof_op(int op, XEN_GUEST_H
|
||||
|
||||
if ( (op < 0) || (op > XENOPROF_last_op) )
|
@ -1,7 +1,8 @@
|
||||
From 27593ec62bdad8621df910931349d964a6dbaa8c Mon Sep 17 00:00:00 2001
|
||||
From: Ian Jackson <ian.jackson@eu.citrix.com>
|
||||
Date: Wed, 21 Oct 2015 16:18:30 +0100
|
||||
Subject: [PATCH XSA-153 v3] libxl: adjust PoD target by memory fudge, too
|
||||
# Commit e294a0c3af9f4443dc692b180fb1771b1cb075e8
|
||||
# Date 2015-10-29 15:11:51 +0000
|
||||
# Author Ian Jackson <ian.jackson@eu.citrix.com>
|
||||
# Committer Ian Jackson <Ian.Jackson@eu.citrix.com>
|
||||
libxl: adjust PoD target by memory fudge, too
|
||||
|
||||
PoD guests need to balloon at least as far as required by PoD, or risk
|
||||
crashing. Currently they don't necessarily know what the right value
|
||||
@ -32,37 +33,30 @@ probably also in stable trees.
|
||||
This is XSA-153.
|
||||
|
||||
Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
|
||||
---
|
||||
tools/libxl/libxl.c | 2 +-
|
||||
tools/libxl/libxl_dom.c | 9 ++++++++-
|
||||
2 files changed, 9 insertions(+), 2 deletions(-)
|
||||
(cherry picked from commit 56fb5fd62320eb40a7517206f9706aa9188d6f7b)
|
||||
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl.c
|
||||
@@ -4859,7 +4859,7 @@ retry_transaction:
|
||||
--- a/tools/libxl/libxl.c
|
||||
+++ b/tools/libxl/libxl.c
|
||||
@@ -4815,7 +4815,7 @@ retry_transaction:
|
||||
}
|
||||
|
||||
new_target_memkb -= videoram;
|
||||
rc = xc_domain_set_pod_target(ctx->xch, domid,
|
||||
- new_target_memkb / 4, NULL, NULL, NULL);
|
||||
+ (new_target_memkb + LIBXL_MAXMEM_CONSTANT) / 4, NULL, NULL, NULL);
|
||||
if (rc != 0) {
|
||||
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
|
||||
"xc_domain_set_pod_target domid=%d, memkb=%d "
|
||||
Index: xen-4.5.1-testing/tools/libxl/libxl_dom.c
|
||||
===================================================================
|
||||
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dom.c
|
||||
+++ xen-4.5.1-testing/tools/libxl/libxl_dom.c
|
||||
@@ -446,6 +446,7 @@ int libxl__build_post(libxl__gc *gc, uin
|
||||
--- a/tools/libxl/libxl_dom.c
|
||||
+++ b/tools/libxl/libxl_dom.c
|
||||
@@ -486,6 +486,7 @@ int libxl__build_post(libxl__gc *gc, uin
|
||||
xs_transaction_t t;
|
||||
char **ents;
|
||||
int i, rc;
|
||||
+ int64_t mem_target_fudge;
|
||||
|
||||
rc = libxl_domain_sched_params_set(CTX, domid, &info->sched_params);
|
||||
if (rc)
|
||||
@@ -472,11 +473,17 @@ int libxl__build_post(libxl__gc *gc, uin
|
||||
if (info->num_vnuma_nodes && !info->num_vcpu_soft_affinity) {
|
||||
rc = set_vnuma_affinity(gc, domid, info);
|
||||
@@ -518,11 +519,17 @@ int libxl__build_post(libxl__gc *gc, uin
|
||||
}
|
||||
}
|
||||
|
88
56377442-x86-PoD-Make-p2m_pod_empty_cache-restartable.patch
Normal file
@ -0,0 +1,88 @@
|
||||
# Commit 59a5061723ba47c0028cf48487e5de551c42a378
|
||||
# Date 2015-11-02 15:33:38 +0100
|
||||
# Author Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/PoD: Make p2m_pod_empty_cache() restartable
|
||||
|
||||
This avoids a long running operation when destroying a domain with a
|
||||
large PoD cache.
|
||||
|
||||
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m-pod.c
|
||||
+++ b/xen/arch/x86/mm/p2m-pod.c
|
||||
@@ -375,11 +375,11 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
-void
|
||||
-p2m_pod_empty_cache(struct domain *d)
|
||||
+int p2m_pod_empty_cache(struct domain *d)
|
||||
{
|
||||
struct p2m_domain *p2m = p2m_get_hostp2m(d);
|
||||
struct page_info *page;
|
||||
+ unsigned int i;
|
||||
|
||||
/* After this barrier no new PoD activities can happen. */
|
||||
BUG_ON(!d->is_dying);
|
||||
@@ -389,8 +389,6 @@ p2m_pod_empty_cache(struct domain *d)
|
||||
|
||||
while ( (page = page_list_remove_head(&p2m->pod.super)) )
|
||||
{
|
||||
- int i;
|
||||
-
|
||||
for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
|
||||
{
|
||||
BUG_ON(page_get_owner(page + i) != d);
|
||||
@@ -398,19 +396,27 @@ p2m_pod_empty_cache(struct domain *d)
|
||||
}
|
||||
|
||||
p2m->pod.count -= SUPERPAGE_PAGES;
|
||||
+
|
||||
+ if ( hypercall_preempt_check() )
|
||||
+ goto out;
|
||||
}
|
||||
|
||||
- while ( (page = page_list_remove_head(&p2m->pod.single)) )
|
||||
+ for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
|
||||
{
|
||||
BUG_ON(page_get_owner(page) != d);
|
||||
page_list_add_tail(page, &d->page_list);
|
||||
|
||||
p2m->pod.count -= 1;
|
||||
+
|
||||
+ if ( i && !(i & 511) && hypercall_preempt_check() )
|
||||
+ goto out;
|
||||
}
|
||||
|
||||
BUG_ON(p2m->pod.count != 0);
|
||||
|
||||
+ out:
|
||||
unlock_page_alloc(p2m);
|
||||
+ return p2m->pod.count ? -ERESTART : 0;
|
||||
}
|
||||
|
||||
int
|
||||
--- a/xen/arch/x86/mm/paging.c
|
||||
+++ b/xen/arch/x86/mm/paging.c
|
||||
@@ -815,7 +815,7 @@ int paging_teardown(struct domain *d)
|
||||
return rc;
|
||||
|
||||
/* Move populate-on-demand cache back to domain_list for destruction */
|
||||
- p2m_pod_empty_cache(d);
|
||||
+ rc = p2m_pod_empty_cache(d);
|
||||
|
||||
return rc;
|
||||
}
|
||||
--- a/xen/include/asm-x86/p2m.h
|
||||
+++ b/xen/include/asm-x86/p2m.h
|
||||
@@ -588,7 +588,7 @@ void p2m_pod_dump_data(struct domain *d)
|
||||
|
||||
/* Move all pages from the populate-on-demand cache to the domain page_list
|
||||
* (usually in preparation for domain destruction) */
|
||||
-void p2m_pod_empty_cache(struct domain *d);
|
||||
+int p2m_pod_empty_cache(struct domain *d);
|
||||
|
||||
/* Set populate-on-demand cache size so that the total memory allocated to a
|
||||
* domain matches target */
|
134
5641ceec-x86-HVM-always-intercept-AC-and-DB.patch
Normal file
@ -0,0 +1,134 @@
|
||||
# Commit bd2239d9fa975a1ee5bcd27c218ae042cd0a57bc
|
||||
# Date 2015-11-10 12:03:08 +0100
|
||||
# Author Jan Beulich <jbeulich@suse.com>
|
||||
# Committer Jan Beulich <jbeulich@suse.com>
|
||||
x86/HVM: always intercept #AC and #DB
|
||||
|
||||
Both being benign exceptions, and both being possible to get triggered
|
||||
by exception delivery, this is required to prevent a guest from locking
|
||||
up a CPU (resulting from no other VM exits occurring once getting into
|
||||
such a loop).
|
||||
|
||||
The specific scenarios:
|
||||
|
||||
1) #AC may be raised during exception delivery if the handler is set to
|
||||
be a ring-3 one by a 32-bit guest, and the stack is misaligned.
|
||||
|
||||
This is CVE-2015-5307 / XSA-156.
|
||||
|
||||
Reported-by: Benjamin Serebrin <serebrin@google.com>
|
||||
|
||||
2) #DB may be raised during exception delivery when a breakpoint got
|
||||
placed on a data structure involved in delivering the exception. This
|
||||
can result in an endless loop when a 64-bit guest uses a non-zero IST
|
||||
for the vector 1 IDT entry, but even without use of IST the time it
|
||||
takes until a contributory fault would get raised (results depending
|
||||
on the handler) may be quite long.
|
||||
|
||||
This is CVE-2015-8104 / XSA-156.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/hvm/svm/svm.c
|
||||
+++ b/xen/arch/x86/hvm/svm/svm.c
|
||||
@@ -1043,10 +1043,11 @@ static void noreturn svm_do_resume(struc
|
||||
unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
|
||||
{
|
||||
uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
|
||||
- uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
|
||||
+
|
||||
v->arch.hvm_vcpu.debug_state_latch = debug_state;
|
||||
vmcb_set_exception_intercepts(
|
||||
- vmcb, debug_state ? (intercepts | mask) : (intercepts & ~mask));
|
||||
+ vmcb, debug_state ? (intercepts | (1U << TRAP_int3))
|
||||
+ : (intercepts & ~(1U << TRAP_int3)));
|
||||
}
|
||||
|
||||
if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
|
||||
@@ -2434,8 +2435,9 @@ void svm_vmexit_handler(struct cpu_user_
|
||||
|
||||
case VMEXIT_EXCEPTION_DB:
|
||||
if ( !v->domain->debugger_attached )
|
||||
- goto unexpected_exit_type;
|
||||
- domain_pause_for_debugger();
|
||||
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
|
||||
+ else
|
||||
+ domain_pause_for_debugger();
|
||||
break;
|
||||
|
||||
case VMEXIT_EXCEPTION_BP:
|
||||
@@ -2483,6 +2485,11 @@ void svm_vmexit_handler(struct cpu_user_
|
||||
break;
|
||||
}
|
||||
|
||||
+ case VMEXIT_EXCEPTION_AC:
|
||||
+ HVMTRACE_1D(TRAP, TRAP_alignment_check);
|
||||
+ hvm_inject_hw_exception(TRAP_alignment_check, vmcb->exitinfo1);
|
||||
+ break;
|
||||
+
|
||||
case VMEXIT_EXCEPTION_UD:
|
||||
svm_vmexit_ud_intercept(regs);
|
||||
break;
|
||||
--- a/xen/arch/x86/hvm/vmx/vmx.c
|
||||
+++ b/xen/arch/x86/hvm/vmx/vmx.c
|
||||
@@ -1224,16 +1224,10 @@ static void vmx_update_host_cr3(struct v
|
||||
|
||||
void vmx_update_debug_state(struct vcpu *v)
|
||||
{
|
||||
- unsigned long mask;
|
||||
-
|
||||
- mask = 1u << TRAP_int3;
|
||||
- if ( !cpu_has_monitor_trap_flag )
|
||||
- mask |= 1u << TRAP_debug;
|
||||
-
|
||||
if ( v->arch.hvm_vcpu.debug_state_latch )
|
||||
- v->arch.hvm_vmx.exception_bitmap |= mask;
|
||||
+ v->arch.hvm_vmx.exception_bitmap |= 1U << TRAP_int3;
|
||||
else
|
||||
- v->arch.hvm_vmx.exception_bitmap &= ~mask;
|
||||
+ v->arch.hvm_vmx.exception_bitmap &= ~(1U << TRAP_int3);
|
||||
|
||||
vmx_vmcs_enter(v);
|
||||
vmx_update_exception_bitmap(v);
|
||||
@@ -3041,9 +3035,10 @@ void vmx_vmexit_handler(struct cpu_user_
|
||||
__vmread(EXIT_QUALIFICATION, &exit_qualification);
|
||||
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
|
||||
write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
|
||||
- if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
|
||||
- goto exit_and_crash;
|
||||
- domain_pause_for_debugger();
|
||||
+ if ( !v->domain->debugger_attached )
|
||||
+ hvm_inject_hw_exception(vector, HVM_DELIVER_NO_ERROR_CODE);
|
||||
+ else
|
||||
+ domain_pause_for_debugger();
|
||||
break;
|
||||
case TRAP_int3:
|
||||
{
|
||||
@@ -3108,6 +3103,11 @@ void vmx_vmexit_handler(struct cpu_user_
|
||||
|
||||
hvm_inject_page_fault(regs->error_code, exit_qualification);
|
||||
break;
|
||||
+ case TRAP_alignment_check:
|
||||
+ HVMTRACE_1D(TRAP, vector);
|
||||
+ __vmread(VM_EXIT_INTR_ERROR_CODE, &ecode);
|
||||
+ hvm_inject_hw_exception(vector, ecode);
|
||||
+ break;
|
||||
case TRAP_nmi:
|
||||
if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
|
||||
X86_EVENTTYPE_NMI )
|
||||
--- a/xen/include/asm-x86/hvm/hvm.h
|
||||
+++ b/xen/include/asm-x86/hvm/hvm.h
|
||||
@@ -384,7 +384,10 @@ static inline int hvm_event_pending(stru
|
||||
(X86_CR4_VMXE | X86_CR4_PAE | X86_CR4_MCE))
|
||||
|
||||
/* These exceptions must always be intercepted. */
|
||||
-#define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op))
|
||||
+#define HVM_TRAP_MASK ((1U << TRAP_debug) | \
|
||||
+ (1U << TRAP_invalid_op) | \
|
||||
+ (1U << TRAP_alignment_check) | \
|
||||
+ (1U << TRAP_machine_check))
|
||||
|
||||
/*
|
||||
* x86 event types. This enumeration is valid for:
|
20
5644b756-x86-HVM-don-t-inject-DB-with-error-code.patch
Normal file
@ -0,0 +1,20 @@
# Commit 057e0e72d2a5d598087c5f167ec6a13203a3cf65
# Date 2015-11-12 16:59:18 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: don't inject #DB with error code

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper@citrix.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -4071,7 +4071,7 @@ void hvm_task_switch(
goto out;

if ( (tss.trace & 1) && !exn_raised )
- hvm_inject_hw_exception(TRAP_debug, tss_sel & 0xfff8);
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);

tr.attr.fields.type = 0xb; /* busy 32-bit tss */
hvm_set_segment_register(v, x86_seg_tr, &tr);
@ -21,11 +21,11 @@ Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Benoit Canet <benoit@irqsave.net>

Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/block/qcow.c
Index: xen-4.6.0-testing/tools/qemu-xen-dir-remote/block/qcow.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/block/qcow.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/block/qcow.c
@@ -147,6 +147,14 @@ static int qcow_open(BlockDriverState *b
--- xen-4.6.0-testing.orig/tools/qemu-xen-dir-remote/block/qcow.c
+++ xen-4.6.0-testing/tools/qemu-xen-dir-remote/block/qcow.c
@@ -148,6 +148,14 @@ static int qcow_open(BlockDriverState *b
goto fail;
}
|
||||
|
||||
|
@ -1,216 +0,0 @@
|
||||
xl: Sane handling of extra config file arguments
|
||||
|
||||
Various xl sub-commands take additional parameters containing = as
|
||||
additional config fragments.
|
||||
|
||||
The handling of these config fragments has a number of bugs:
|
||||
|
||||
1. Use of a static 1024-byte buffer. (If truncation would occur,
|
||||
with semi-trusted input, a security risk arises due to quotes
|
||||
being lost.)
|
||||
|
||||
2. Mishandling of the return value from snprintf, so that if
|
||||
truncation occurs, the to-write pointer is updated with the
|
||||
wanted-to-write length, resulting in stack corruption. (This is
|
||||
XSA-137.)
|
||||
|
||||
3. Clone-and-hack of the code for constructing the appended
|
||||
config file.
|
||||
|
||||
These are fixed here, by introducing a new function
|
||||
`string_realloc_append' and using it everywhere. The `extra_info'
|
||||
buffers are replaced by pointers, which start off NULL and are
|
||||
explicitly freed on all return paths.
|
||||
|
||||
The separate variable which will become dom_info.extra_config is
|
||||
abolished (which involves moving the clearing of dom_info).
|
||||
|
||||
Additional bugs I observe, not fixed here:
|
||||
|
||||
4. The functions which now call string_realloc_append use ad-hoc
|
||||
error returns, with multiple calls to `return'. This currently
|
||||
necessitates multiple new calls to `free'.
|
||||
|
||||
5. Many of the paths in xl call exit(-rc) where rc is a libxl status
|
||||
code. This is a ridiculous exit status `convention'.
|
||||
|
||||
6. The loops for handling extra config data are clone-and-hacks.
|
||||
|
||||
7. Once the extra config buffer is accumulated, it must be combined
|
||||
with the appropriate main config file. The code to do this
|
||||
combining is clone-and-hacked too.
|
||||
|
||||
Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
|
||||
Tested-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
|
||||
Acked-by: Ian Campbell <ian,campbell@citrix.com>
|
||||
|
||||
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -151,7 +151,7 @@ struct domain_create {
int console_autoconnect;
int checkpointed_stream;
const char *config_file;
- const char *extra_config; /* extra config string */
+ char *extra_config; /* extra config string */
const char *restore_file;
int migrate_fd; /* -1 means none */
char **migration_domname_r; /* from malloc */
@@ -4572,11 +4572,25 @@ int main_vm_list(int argc, char **argv)
return 0;
}

+static void string_realloc_append(char **accumulate, const char *more)
+{
+ /* Appends more to accumulate. Accumulate is either NULL, or
+ * points (always) to a malloc'd nul-terminated string. */
+
+ size_t oldlen = *accumulate ? strlen(*accumulate) : 0;
+ size_t morelen = strlen(more) + 1/*nul*/;
+ if (oldlen > SSIZE_MAX || morelen > SSIZE_MAX - oldlen) {
+ fprintf(stderr,"Additional config data far too large\n");
+ exit(-ERROR_FAIL);
+ }
+
+ *accumulate = xrealloc(*accumulate, oldlen + morelen);
+ memcpy(*accumulate + oldlen, more, morelen);
+}
+
int main_create(int argc, char **argv)
{
const char *filename = NULL;
- char *p;
- char extra_config[1024];
struct domain_create dom_info;
int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
@@ -4591,6 +4605,8 @@ int main_create(int argc, char **argv)
{0, 0, 0, 0}
};

+ dom_info.extra_config = NULL;
+
if (argv[1] && argv[1][0] != '-' && !strchr(argv[1], '=')) {
filename = argv[1];
argc--; argv++;
@@ -4630,20 +4646,21 @@ int main_create(int argc, char **argv)
break;
}

- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ memset(&dom_info, 0, sizeof(dom_info));
+
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&dom_info.extra_config, argv[optind]);
+ string_realloc_append(&dom_info.extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(dom_info.extra_config);
return 2;
}
}

- memset(&dom_info, 0, sizeof(dom_info));
dom_info.debug = debug;
dom_info.daemonize = daemonize;
dom_info.monitor = monitor;
@@ -4651,16 +4668,18 @@ int main_create(int argc, char **argv)
dom_info.dryrun = dryrun_only;
dom_info.quiet = quiet;
dom_info.config_file = filename;
- dom_info.extra_config = extra_config;
dom_info.migrate_fd = -1;
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;

rc = create_domain(&dom_info);
- if (rc < 0)
+ if (rc < 0) {
+ free(dom_info.extra_config);
return -rc;
+ }

+ free(dom_info.extra_config);
return 0;
}

@@ -4668,8 +4687,7 @@ int main_config_update(int argc, char **
{
uint32_t domid;
const char *filename = NULL;
- char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
void *config_data = 0;
int config_len = 0;
libxl_domain_config d_config;
@@ -4707,15 +4725,15 @@ int main_config_update(int argc, char **
break;
}

- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&extra_config, argv[optind]);
+ string_realloc_append(&extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(extra_config);
return 2;
}
}
@@ -4724,7 +4742,8 @@ int main_config_update(int argc, char **
rc = libxl_read_file_contents(ctx, filename,
&config_data, &config_len);
if (rc) { fprintf(stderr, "Failed to read config file: %s: %s\n",
- filename, strerror(errno)); return ERROR_FAIL; }
+ filename, strerror(errno));
+ free(extra_config); return ERROR_FAIL; }
if (strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2 + 1)) {
fprintf(stderr, "Failed to attach extra configration\n");
@@ -4765,7 +4784,7 @@ int main_config_update(int argc, char **
libxl_domain_config_dispose(&d_config);

free(config_data);
-
+ free(extra_config);
return 0;
}

@@ -7022,7 +7041,7 @@ int main_cpupoolcreate(int argc, char **
{
const char *filename = NULL, *config_src=NULL;
const char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
int opt;
static struct option opts[] = {
{"defconfig", 1, 0, 'f'},
@@ -7056,13 +7075,10 @@ int main_cpupoolcreate(int argc, char **
break;
}

- memset(extra_config, 0, sizeof(extra_config));
while (optind < argc) {
if ((p = strchr(argv[optind], '='))) {
- if (strlen(extra_config) + 1 + strlen(argv[optind]) < sizeof(extra_config)) {
- strcat(extra_config, "\n");
- strcat(extra_config, argv[optind]);
- }
+ string_realloc_append(&extra_config, "\n");
+ string_realloc_append(&extra_config, argv[optind]);
} else if (!filename) {
filename = argv[optind];
} else {
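
For illustration, a self-contained sketch of the grow-by-realloc accumulator pattern this patch introduces; it substitutes plain realloc() and exit(1) for xl's xrealloc() and ERROR_FAIL, and the main() driver is made up, so it is a simplified model rather than the xl code itself.

/* Sketch of the realloc-based accumulator pattern, standalone. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void string_realloc_append(char **accumulate, const char *more)
{
    /* *accumulate is either NULL or a malloc'd nul-terminated string. */
    size_t oldlen = *accumulate ? strlen(*accumulate) : 0;
    size_t morelen = strlen(more) + 1 /* nul */;
    char *p;

    if (oldlen > SSIZE_MAX || morelen > SSIZE_MAX - oldlen) {
        fprintf(stderr, "additional config data far too large\n");
        exit(1);
    }

    p = realloc(*accumulate, oldlen + morelen);
    if (!p) {
        perror("realloc");
        exit(1);
    }
    memcpy(p + oldlen, more, morelen);   /* copies the trailing nul too */
    *accumulate = p;
}

int main(int argc, char **argv)
{
    char *extra_config = NULL;           /* starts NULL, grows as needed */

    for (int i = 1; i < argc; i++) {
        if (strchr(argv[i], '=')) {
            string_realloc_append(&extra_config, argv[i]);
            string_realloc_append(&extra_config, "\n");
        }
    }
    if (extra_config)
        fputs(extra_config, stdout);
    free(extra_config);
    return 0;
}

The key property is that the buffer can never be silently truncated: either the append succeeds or the program bails out, which is exactly what the fixed-size snprintf version failed to guarantee.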
@ -1,37 +0,0 @@
tools: libxl: allow permissive qemu-upstream pci passthrough

Since XSA-131 qemu-xen now restricts access to PCI cfg by default. In
order to allow local configuration of the existing libxl_device_pci
"permissive" flag needs to be plumbed through via the new QMP property
added by the XSA-131 patches.

Versions of QEMU prior to XSA-131 did not support this permissive
property, so we only pass it if it is true. Older versions only
supported permissive mode.

qemu-xen-traditional already supports the permissive mode setting via
xenstore.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

--- a/tools/libxl/libxl_qmp.c
+++ b/tools/libxl/libxl_qmp.c
@@ -835,6 +835,18 @@ int libxl__qmp_pci_add(libxl__gc *gc, in
QMP_PARAMETERS_SPRINTF(&args, "addr", "%x.%x",
PCI_SLOT(pcidev->vdevfn), PCI_FUNC(pcidev->vdevfn));
}
+ /*
+ * Version of QEMU prior to the XSA-131 fix did not support this
+ * property and were effectively always in permissive mode. The
+ * fix for XSA-131 switched the default to be restricted by
+ * default and added the permissive property.
+ *
+ * Therefore in order to support both old and new QEMU we only set
+ * the permissive flag if it is true. Users of older QEMU have no
+ * reason to set the flag so this is ok.
+ */
+ if (pcidev->permissive)
+ qmp_parameters_add_bool(gc, &args, "permissive", true);

rc = qmp_synchronous_send(qmp, "device_add", args,
NULL, NULL, qmp->timeout);
@ -1,74 +0,0 @@
From a9de14175548c04e0f8be7fae219246509ba46a9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:13:31 +0200
Subject: [PATCH 1/3] ide: Check array bounds before writing to io_buffer
(CVE-2015-5154)

If the end_transfer_func of a command is called because enough data has
been read or written for the current PIO transfer, and it fails to
correctly call the command completion functions, the DRQ bit in the
status register and s->end_transfer_func may remain set. This allows the
guest to access further bytes in s->io_buffer beyond s->data_end, and
eventually overflowing the io_buffer.

One case where this currently happens is emulation of the ATAPI command
START STOP UNIT.

This patch fixes the problem by adding explicit array bounds checks
before accessing the buffer instead of relying on end_transfer_func to
function correctly.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -3002,6 +3002,10 @@ static void ide_data_writew(void *opaque
buffered_pio_write(s, addr, 2);

p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return;
+ }
+
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
@@ -3021,6 +3025,10 @@ static uint32_t ide_data_readw(void *opa
buffered_pio_read(s, addr, 2);

p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
@@ -3040,6 +3048,10 @@ static void ide_data_writel(void *opaque
buffered_pio_write(s, addr, 4);

p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return;
+ }
+
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
@@ -3059,6 +3071,10 @@ static uint32_t ide_data_readl(void *opa
buffered_pio_read(s, addr, 4);

p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
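
A compact standalone model of the bounds check these hunks add, assuming a simplified PioState struct in place of QEMU's IDEState; it demonstrates only the guard against running past data_end, not the full IDE emulation.

/* Refuse a PIO data access that would run past the transfer buffer. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    uint8_t io_buffer[512];
    uint8_t *data_ptr;    /* next byte to read/write */
    uint8_t *data_end;    /* one past the last valid byte */
} PioState;

static void pio_data_writew(PioState *s, uint16_t val)
{
    uint8_t *p = s->data_ptr;

    if (p + 2 > s->data_end)   /* would overflow the buffer: drop it */
        return;

    memcpy(p, &val, 2);
    s->data_ptr = p + 2;
}

int main(void)
{
    PioState s;

    s.data_ptr = s.io_buffer;
    s.data_end = s.io_buffer + 4;        /* pretend a 4-byte transfer */

    for (int i = 0; i < 8; i++)          /* only the first two writes land */
        pio_data_writew(&s, 0xABCD);

    printf("bytes accepted: %ld\n", (long)(s.data_ptr - s.io_buffer));
    return 0;
}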
@ -1,68 +0,0 @@
From 1d3c2268f8708126a34064c2e0c1000b40e6f3e5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:41:27 +0200
Subject: [PATCH 3/3] ide: Clear DRQ after handling all expected accesses

This is additional hardening against an end_transfer_func that fails to
clear the DRQ status bit. The bit must be unset as soon as the PIO
transfer has completed, so it's better to do this in a central place
instead of duplicating the code in all commands (and forgetting it in
some).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -3016,8 +3016,10 @@ static void ide_data_writew(void *opaque
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}

static uint32_t ide_data_readw(void *opaque, uint32_t addr)
@@ -3039,8 +3041,10 @@ static uint32_t ide_data_readw(void *opa
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}

@@ -3062,8 +3066,10 @@ static void ide_data_writel(void *opaque
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}

static uint32_t ide_data_readl(void *opaque, uint32_t addr)
@@ -3085,8 +3091,10 @@ static uint32_t ide_data_readl(void *opa
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}
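
A sketch of the same idea with hypothetical stand-in types, not QEMU's IDEState: the DRQ-style status bit is cleared in the one common place that detects end of transfer, so an individual command handler cannot forget it.

/* Clear the "data request" bit centrally when the transfer completes. */
#include <stdint.h>
#include <stdio.h>

#define DRQ_STAT 0x08

typedef struct PioState PioState;
struct PioState {
    uint8_t status;
    int     remaining;                  /* words left in this transfer */
    void  (*end_transfer_func)(PioState *s);
};

static void transfer_done(PioState *s)
{
    printf("transfer complete, status=0x%02x\n", s->status);
}

static void pio_consume_word(PioState *s)
{
    if (s->remaining > 0)
        s->remaining--;
    if (s->remaining == 0) {
        s->status &= ~DRQ_STAT;         /* done in one central place */
        s->end_transfer_func(s);
    }
}

int main(void)
{
    PioState s = { .status = DRQ_STAT, .remaining = 2,
                   .end_transfer_func = transfer_done };

    pio_consume_word(&s);
    pio_consume_word(&s);               /* DRQ is dropped before the callback */
    return 0;
}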
@ -1,54 +0,0 @@
Subject: ATAPI: STARTSTOPUNIT only eject/load media if powercondition is 0
From: Ronnie Sahlberg ronniesahlberg@gmail.com Tue Jul 31 11:28:26 2012 +1000
Date: Wed Sep 12 15:50:09 2012 +0200:
Git: ce560dcf20c14194db5ef3b9fc1ea592d4e68109

The START STOP UNIT command will only eject/load media if
power condition is zero.

If power condition is !0 then LOEJ and START will be ignored.

From MMC (sbc contains similar wordings too)
The Power Conditions field requests the block device to be placed
in the power condition defined in
Table 558. If this field has a value other than 0h then the Start
and LoEj bits shall be ignored.

Signed-off-by: Ronnie Sahlberg <ronniesahlberg@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>

From aa851d30acfbb9580098ac1dc82885530cb8b3c1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:17:46 +0200
Subject: [PATCH 2/3] ide/atapi: Fix START STOP UNIT command completion

The command must be completed on all code paths. START STOP UNIT with
pwrcnd set should succeed without doing anything.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/atapi.c | 1 +
1 file changed, 1 insertion(+)

Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -2095,9 +2095,16 @@ static void ide_atapi_cmd(IDEState *s)
break;
case GPCMD_START_STOP_UNIT:
{
- int start, eject;
+ int start, eject, pwrcnd;
start = packet[4] & 1;
eject = (packet[4] >> 1) & 1;
+ pwrcnd = buf[4] & 0xf0;
+
+ if (pwrcnd) {
+ /* eject/load only happens for power condition == 0 */
+ ide_atapi_cmd_ok(s);
+ return;
+ }

if (eject && !start) {
/* eject the disk */
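
A standalone sketch of the CDB decoding described above (the hunk reads the field from buf[4]; this sketch uses the same packet byte throughout, which is an assumption of the model): a non-zero Power Conditions field makes Start/LoEj a no-op while the command still completes successfully.

/* Decode the START STOP UNIT CDB and honour the Power Conditions field. */
#include <stdint.h>
#include <stdio.h>

static void start_stop_unit(const uint8_t *packet)
{
    int start  = packet[4] & 1;
    int eject  = (packet[4] >> 1) & 1;
    int pwrcnd = packet[4] & 0xf0;

    if (pwrcnd) {
        /* eject/load only happens for power condition == 0 */
        printf("power condition %#x: Start/LoEj ignored, command ok\n",
               (unsigned)pwrcnd);
        return;
    }
    printf("start=%d eject=%d\n", start, eject);
}

int main(void)
{
    uint8_t eject_cdb[6] = { 0x1b, 0, 0, 0, 0x02, 0 };  /* LoEj=1, Start=0 */
    uint8_t idle_cdb[6]  = { 0x1b, 0, 0, 0, 0x20, 0 };  /* power cond = 2h */

    start_stop_unit(eject_cdb);
    start_stop_unit(idle_cdb);
    return 0;
}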
@ -1,74 +0,0 @@
From a9de14175548c04e0f8be7fae219246509ba46a9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:13:31 +0200
Subject: [PATCH 1/3] ide: Check array bounds before writing to io_buffer
(CVE-2015-5154)

If the end_transfer_func of a command is called because enough data has
been read or written for the current PIO transfer, and it fails to
correctly call the command completion functions, the DRQ bit in the
status register and s->end_transfer_func may remain set. This allows the
guest to access further bytes in s->io_buffer beyond s->data_end, and
eventually overflowing the io_buffer.

One case where this currently happens is emulation of the ATAPI command
START STOP UNIT.

This patch fixes the problem by adding explicit array bounds checks
before accessing the buffer instead of relying on end_transfer_func to
function correctly.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/core.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
@@ -1901,6 +1901,10 @@ void ide_data_writew(void *opaque, uint3
}

p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return;
+ }
+
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
@@ -1922,6 +1926,10 @@ uint32_t ide_data_readw(void *opaque, ui
}

p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
@@ -1943,6 +1951,10 @@ void ide_data_writel(void *opaque, uint3
}

p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return;
+ }
+
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
@@ -1964,6 +1976,10 @@ uint32_t ide_data_readl(void *opaque, ui
}

p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
@ -1,68 +0,0 @@
From 1d3c2268f8708126a34064c2e0c1000b40e6f3e5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:41:27 +0200
Subject: [PATCH 3/3] ide: Clear DRQ after handling all expected accesses

This is additional hardening against an end_transfer_func that fails to
clear the DRQ status bit. The bit must be unset as soon as the PIO
transfer has completed, so it's better to do this in a central place
instead of duplicating the code in all commands (and forgetting it in
some).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/core.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
@@ -1908,8 +1908,10 @@ void ide_data_writew(void *opaque, uint3
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}

uint32_t ide_data_readw(void *opaque, uint32_t addr)
@@ -1933,8 +1935,10 @@ uint32_t ide_data_readw(void *opaque, ui
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}

@@ -1958,8 +1962,10 @@ void ide_data_writel(void *opaque, uint3
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}

uint32_t ide_data_readl(void *opaque, uint32_t addr)
@@ -1983,8 +1989,10 @@ uint32_t ide_data_readl(void *opaque, ui
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}
@ -1,25 +0,0 @@
From aa851d30acfbb9580098ac1dc82885530cb8b3c1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:17:46 +0200
Subject: [PATCH 2/3] ide/atapi: Fix START STOP UNIT command completion

The command must be completed on all code paths. START STOP UNIT with
pwrcnd set should succeed without doing anything.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/atapi.c | 1 +
1 file changed, 1 insertion(+)

Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/atapi.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/atapi.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/atapi.c
@@ -879,6 +879,7 @@ static void cmd_start_stop_unit(IDEState

if (pwrcnd) {
/* eject/load only happens for power condition == 0 */
+ ide_atapi_cmd_ok(s);
return;
}
@ -1,50 +0,0 @@
References: bsc#944463

Subject: ui/vnc: limit client_cut_text msg payload size
From: Peter Lieven pl@kamp.de Mon Jun 30 10:07:54 2014 +0200
Date: Tue Jul 1 13:26:40 2014 +0200:
Git: f9a70e79391f6d7c2a912d785239ee8effc1922d

currently a malicious client could define a payload
size of 2^32 - 1 bytes and send up to that size of
data to the vnc server. The server would allocated
that amount of memory which could easily create an
out of memory condition.

This patch limits the payload size to 1MB max.

Please note that client_cut_text messages are currently
silently ignored.

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>

Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/vnc.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/vnc.c
+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/vnc.c
@@ -1779,14 +1779,21 @@ static int protocol_client_msg(VncState
pointer_event(vs, read_u8(data, 1), read_u16(data, 2), read_u16(data, 4));
break;
case 6:
- if (len == 1)
+ if (len == 1) {
return 8;
-
+ }
if (len == 8) {
uint32_t v;
v = read_u32(data, 4);
- if (v)
+ if (v > (1 << 20)) {
+ VNC_DEBUG("vnc: client_cut_text msg payload has %u bytes"
+ " which exceeds our limit of 1MB.", v);
+ vnc_client_error(vs);
+ break;
+ }
+ if (v > 0) {
return 8 + v;
+ }
}

client_cut_text(vs, read_u32(data, 4), (char *)(data + 8));
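
A minimal sketch of the defence, assuming a plain malloc-based helper rather than QEMU's VNC server code: the client-supplied length is capped (1 MB, as in the patch) before it is ever used to size an allocation.

/* Cap an attacker-controlled length before allocating from it. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CUT_TEXT_LIMIT (1u << 20)       /* 1 MB */

/* Returns a buffer of the requested size, or NULL if the client asked
 * for more than the server is willing to allocate. */
static void *alloc_cut_text(uint32_t client_len)
{
    if (client_len > CUT_TEXT_LIMIT) {
        fprintf(stderr,
                "client_cut_text payload of %u bytes exceeds 1MB limit\n",
                client_len);
        return NULL;                    /* caller drops the client */
    }
    return malloc(client_len ? client_len : 1);
}

int main(void)
{
    void *ok  = alloc_cut_text(4096);
    void *bad = alloc_cut_text(UINT32_MAX);   /* 2^32 - 1: rejected */

    printf("ok=%p bad=%p\n", ok, bad);
    free(ok);
    return 0;
}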
@ -1,49 +0,0 @@
References: bsc#944463

Subject: ui/vnc: limit client_cut_text msg payload size
From: Peter Lieven pl@kamp.de Mon Jun 30 10:07:54 2014 +0200
Date: Tue Jul 1 13:26:40 2014 +0200:
Git: f9a70e79391f6d7c2a912d785239ee8effc1922d

currently a malicious client could define a payload
size of 2^32 - 1 bytes and send up to that size of
data to the vnc server. The server would allocated
that amount of memory which could easily create an
out of memory condition.

This patch limits the payload size to 1MB max.

Please note that client_cut_text messages are currently
silently ignored.

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>

Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/ui/vnc.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/ui/vnc.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/ui/vnc.c
@@ -2149,13 +2149,20 @@ static int protocol_client_msg(VncState
pointer_event(vs, read_u8(data, 1), read_u16(data, 2), read_u16(data, 4));
break;
case VNC_MSG_CLIENT_CUT_TEXT:
- if (len == 1)
+ if (len == 1) {
return 8;
-
+ }
if (len == 8) {
uint32_t dlen = read_u32(data, 4);
- if (dlen > 0)
+ if (dlen > (1 << 20)) {
+ error_report("vnc: client_cut_text msg payload has %u bytes"
+ " which exceeds our limit of 1MB.", dlen);
+ vnc_client_error(vs);
+ break;
+ }
+ if (dlen > 0) {
return 8 + dlen;
+ }
}

client_cut_text(vs, read_u32(data, 4), data + 8);
@ -1,31 +0,0 @@
References: bsc#944697

From: P J P <address@hidden>

While processing transmit descriptors, it could lead to an infinite
loop if 'bytes' was to become zero; Add a check to avoid it.

[The guest can force 'bytes' to 0 by setting the hdr_len and mss
descriptor fields to 0.
--Stefan]

Signed-off-by: P J P <address@hidden>
Signed-off-by: Stefan Hajnoczi <address@hidden>
---
hw/net/e1000.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
@@ -470,7 +470,8 @@ process_tx_desc(E1000State *s, struct e1
memmove(tp->data, tp->header, hdr);
tp->size = hdr;
}
- } while (split_size -= bytes);
+ split_size -= bytes;
+ } while (bytes && split_size);
} else if (!tp->tse && tp->cptse) {
// context descriptor TSE is not set, while data descriptor TSE is set
DBGOUT(TXERR, "TCP segmentaion Error\n");
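
A simplified standalone model of the loop-termination problem and the fix, with made-up helper names (chunk_bytes, process_tx) rather than the e1000 code: the loop must also stop when an iteration makes no progress, because `while (split_size -= bytes)` never terminates once bytes is forced to zero.

/* Model of the transmit loop: stop on completion or on zero progress. */
#include <stdio.h>

/* Returns how many bytes one iteration would consume; 0 models the
 * hdr_len == mss == 0 case a guest can set up. */
static unsigned chunk_bytes(unsigned split_size, unsigned mss)
{
    return split_size < mss ? split_size : mss;
}

static unsigned process_tx(unsigned split_size, unsigned mss)
{
    unsigned iterations = 0;
    unsigned bytes;

    do {
        bytes = chunk_bytes(split_size, mss);
        /* ... copy `bytes` bytes of packet data here ... */
        split_size -= bytes;
        iterations++;
    } while (bytes && split_size);      /* stop when done or no progress */

    return iterations;
}

int main(void)
{
    printf("normal: %u iterations\n", process_tx(4000, 1460));
    printf("mss==0: %u iterations\n", process_tx(4000, 0));  /* terminates now */
    return 0;
}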