Accepting request 346076 from Virtualization

fate#315712: XEN: Use the PVOPS kernel (disable KMP building)

OBS-URL: https://build.opensuse.org/request/show/346076
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/xen?expand=0&rev=213
Author: Dominique Leuenberger, 2015-12-18 20:53:23 +00:00 (committed by Git OBS Bridge)
commit 32bf53107e
187 changed files with 1294 additions and 14752 deletions

@@ -1,49 +0,0 @@
# Commit b7f74a19fe099e373ad52e4218c466f3e91b5f43
# Date 2015-01-23 15:05:48 +0100
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
grant-table: use uint16_t consistently for grant copy offset and length
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -1882,7 +1882,7 @@ static int
__acquire_grant_for_copy(
struct domain *rd, unsigned long gref, domid_t ldom, int readonly,
unsigned long *frame, struct page_info **page,
- unsigned *page_off, unsigned *length, unsigned allow_transitive)
+ uint16_t *page_off, uint16_t *length, unsigned allow_transitive)
{
struct grant_table *rgt = rd->grant_table;
grant_entry_v1_t *sha1;
@@ -1895,8 +1895,8 @@ __acquire_grant_for_copy(
grant_ref_t trans_gref;
struct domain *td;
unsigned long grant_frame;
- unsigned trans_page_off;
- unsigned trans_length;
+ uint16_t trans_page_off;
+ uint16_t trans_length;
int is_sub_page;
s16 rc = GNTST_okay;
@@ -2122,7 +2122,7 @@ __gnttab_copy(
if ( src_is_gref )
{
- unsigned source_off, source_len;
+ uint16_t source_off, source_len;
rc = __acquire_grant_for_copy(sd, op->source.u.ref,
current->domain->domain_id, 1,
&s_frame, &s_pg,
@@ -2147,7 +2147,7 @@ __gnttab_copy(
if ( dest_is_gref )
{
- unsigned dest_off, dest_len;
+ uint16_t dest_off, dest_len;
rc = __acquire_grant_for_copy(dd, op->dest.u.ref,
current->domain->domain_id, 0,
&d_frame, &d_pg, &dest_off, &dest_len, 1);

@@ -1,373 +0,0 @@
# Commit 3c72f8c2cf19f735d813081c836f03e3078ee5c1
# Date 2015-01-29 14:21:00 +0100
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
grant-table: refactor grant copy to reduce duplicate code
Much of the grant copy operation is identical for the source and
destination buffers. Refactor the code into per-buffer functions.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -2077,139 +2077,230 @@ __acquire_grant_for_copy(
return rc;
}
-static void
-__gnttab_copy(
- struct gnttab_copy *op)
-{
- struct domain *sd = NULL, *dd = NULL;
- unsigned long s_frame, d_frame;
- struct page_info *s_pg = NULL, *d_pg = NULL;
- char *sp, *dp;
- s16 rc = GNTST_okay;
- int have_d_grant = 0, have_s_grant = 0;
- int src_is_gref, dest_is_gref;
-
- if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
- ((op->dest.offset + op->len) > PAGE_SIZE) )
- PIN_FAIL(error_out, GNTST_bad_copy_arg, "copy beyond page area.\n");
+struct gnttab_copy_buf {
+ /* Guest provided. */
+ struct gnttab_copy_ptr ptr;
+ uint16_t len;
+
+ /* Mapped etc. */
+ struct domain *domain;
+ unsigned long frame;
+ struct page_info *page;
+ void *virt;
+ bool_t read_only;
+ bool_t have_grant;
+ bool_t have_type;
+};
- src_is_gref = op->flags & GNTCOPY_source_gref;
- dest_is_gref = op->flags & GNTCOPY_dest_gref;
+static int gnttab_copy_lock_domain(domid_t domid, unsigned int gref_flag,
+ struct gnttab_copy_buf *buf)
+{
+ int rc;
- if ( (op->source.domid != DOMID_SELF && !src_is_gref ) ||
- (op->dest.domid != DOMID_SELF && !dest_is_gref) )
- PIN_FAIL(error_out, GNTST_permission_denied,
+ if ( domid != DOMID_SELF && !gref_flag )
+ PIN_FAIL(out, GNTST_permission_denied,
"only allow copy-by-mfn for DOMID_SELF.\n");
- if ( op->source.domid == DOMID_SELF )
- sd = rcu_lock_current_domain();
- else if ( (sd = rcu_lock_domain_by_id(op->source.domid)) == NULL )
- PIN_FAIL(error_out, GNTST_bad_domain,
- "couldn't find %d\n", op->source.domid);
-
- if ( op->dest.domid == DOMID_SELF )
- dd = rcu_lock_current_domain();
- else if ( (dd = rcu_lock_domain_by_id(op->dest.domid)) == NULL )
- PIN_FAIL(error_out, GNTST_bad_domain,
- "couldn't find %d\n", op->dest.domid);
+ if ( domid == DOMID_SELF )
+ buf->domain = rcu_lock_current_domain();
+ else
+ {
+ buf->domain = rcu_lock_domain_by_id(domid);
+ if ( buf->domain == NULL )
+ PIN_FAIL(out, GNTST_bad_domain, "couldn't find %d\n", domid);
+ }
- rc = xsm_grant_copy(XSM_HOOK, sd, dd);
- if ( rc )
+ buf->ptr.domid = domid;
+ rc = GNTST_okay;
+ out:
+ return rc;
+}
+
+static void gnttab_copy_unlock_domains(struct gnttab_copy_buf *src,
+ struct gnttab_copy_buf *dest)
+{
+ if ( src->domain )
+ {
+ rcu_unlock_domain(src->domain);
+ src->domain = NULL;
+ }
+ if ( dest->domain )
+ {
+ rcu_unlock_domain(dest->domain);
+ dest->domain = NULL;
+ }
+}
+
+static int gnttab_copy_lock_domains(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *src,
+ struct gnttab_copy_buf *dest)
+{
+ int rc;
+
+ rc = gnttab_copy_lock_domain(op->source.domid,
+ op->flags & GNTCOPY_source_gref, src);
+ if ( rc < 0 )
+ goto error;
+ rc = gnttab_copy_lock_domain(op->dest.domid,
+ op->flags & GNTCOPY_dest_gref, dest);
+ if ( rc < 0 )
+ goto error;
+
+ rc = xsm_grant_copy(XSM_HOOK, src->domain, dest->domain);
+ if ( rc < 0 )
{
rc = GNTST_permission_denied;
- goto error_out;
+ goto error;
}
+ return 0;
+
+ error:
+ gnttab_copy_unlock_domains(src, dest);
+ return rc;
+}
- if ( src_is_gref )
+static void gnttab_copy_release_buf(struct gnttab_copy_buf *buf)
+{
+ if ( buf->virt )
{
- uint16_t source_off, source_len;
- rc = __acquire_grant_for_copy(sd, op->source.u.ref,
- current->domain->domain_id, 1,
- &s_frame, &s_pg,
- &source_off, &source_len, 1);
- if ( rc != GNTST_okay )
- goto error_out;
- have_s_grant = 1;
- if ( op->source.offset < source_off ||
- op->len > source_len )
- PIN_FAIL(error_out, GNTST_general_error,
- "copy source out of bounds: %d < %d || %d > %d\n",
- op->source.offset, source_off,
- op->len, source_len);
+ unmap_domain_page(buf->virt);
+ buf->virt = NULL;
}
- else
+ if ( buf->have_type )
{
- rc = __get_paged_frame(op->source.u.gmfn, &s_frame, &s_pg, 1, sd);
- if ( rc != GNTST_okay )
- PIN_FAIL(error_out, rc,
- "source frame %lx invalid.\n", s_frame);
+ put_page_type(buf->page);
+ buf->have_type = 0;
+ }
+ if ( buf->page )
+ {
+ put_page(buf->page);
+ buf->page = NULL;
+ }
+ if ( buf->have_grant )
+ {
+ __release_grant_for_copy(buf->domain, buf->ptr.u.ref, buf->read_only);
+ buf->have_grant = 0;
}
+}
+
+static int gnttab_copy_claim_buf(const struct gnttab_copy *op,
+ const struct gnttab_copy_ptr *ptr,
+ struct gnttab_copy_buf *buf,
+ unsigned int gref_flag)
+{
+ int rc;
+
+ buf->read_only = gref_flag == GNTCOPY_source_gref;
- if ( dest_is_gref )
+ if ( op->flags & gref_flag )
{
- uint16_t dest_off, dest_len;
- rc = __acquire_grant_for_copy(dd, op->dest.u.ref,
- current->domain->domain_id, 0,
- &d_frame, &d_pg, &dest_off, &dest_len, 1);
+ rc = __acquire_grant_for_copy(buf->domain, ptr->u.ref,
+ current->domain->domain_id,
+ buf->read_only,
+ &buf->frame, &buf->page,
+ &buf->ptr.offset, &buf->len, 1);
if ( rc != GNTST_okay )
- goto error_out;
- have_d_grant = 1;
- if ( op->dest.offset < dest_off ||
- op->len > dest_len )
- PIN_FAIL(error_out, GNTST_general_error,
- "copy dest out of bounds: %d < %d || %d > %d\n",
- op->dest.offset, dest_off,
- op->len, dest_len);
+ goto out;
+ buf->ptr.u.ref = ptr->u.ref;
+ buf->have_grant = 1;
}
else
{
- rc = __get_paged_frame(op->dest.u.gmfn, &d_frame, &d_pg, 0, dd);
+ rc = __get_paged_frame(ptr->u.gmfn, &buf->frame, &buf->page,
+ buf->read_only, buf->domain);
if ( rc != GNTST_okay )
- PIN_FAIL(error_out, rc,
- "destination frame %lx invalid.\n", d_frame);
+ PIN_FAIL(out, rc,
+ "source frame %lx invalid.\n", ptr->u.gmfn);
+
+ buf->ptr.u.gmfn = ptr->u.gmfn;
+ buf->ptr.offset = 0;
+ buf->len = PAGE_SIZE;
}
- if ( !get_page_type(d_pg, PGT_writable_page) )
+ if ( !buf->read_only )
{
- if ( !dd->is_dying )
- gdprintk(XENLOG_WARNING, "Could not get dst frame %lx\n", d_frame);
- rc = GNTST_general_error;
- goto error_out;
- }
-
- sp = map_domain_page(s_frame);
- dp = map_domain_page(d_frame);
-
- memcpy(dp + op->dest.offset, sp + op->source.offset, op->len);
-
- unmap_domain_page(dp);
- unmap_domain_page(sp);
-
- gnttab_mark_dirty(dd, d_frame);
-
- put_page_type(d_pg);
- error_out:
- if ( d_pg )
- put_page(d_pg);
- if ( s_pg )
- put_page(s_pg);
- if ( have_s_grant )
- __release_grant_for_copy(sd, op->source.u.ref, 1);
- if ( have_d_grant )
- __release_grant_for_copy(dd, op->dest.u.ref, 0);
- if ( sd )
- rcu_unlock_domain(sd);
- if ( dd )
- rcu_unlock_domain(dd);
- op->status = rc;
+ if ( !get_page_type(buf->page, PGT_writable_page) )
+ {
+ if ( !buf->domain->is_dying )
+ gdprintk(XENLOG_WARNING, "Could not get writable frame %lx\n", buf->frame);
+ rc = GNTST_general_error;
+ goto out;
+ }
+ buf->have_type = 1;
+ }
+
+ buf->virt = map_domain_page(buf->frame);
+ rc = GNTST_okay;
+
+ out:
+ return rc;
}
-static long
-gnttab_copy(
+static int gnttab_copy_buf(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *dest,
+ const struct gnttab_copy_buf *src)
+{
+ int rc;
+
+ if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
+ ((op->dest.offset + op->len) > PAGE_SIZE) )
+ PIN_FAIL(out, GNTST_bad_copy_arg, "copy beyond page area.\n");
+
+ if ( op->source.offset < src->ptr.offset ||
+ op->source.offset + op->len > src->ptr.offset + src->len )
+ PIN_FAIL(out, GNTST_general_error,
+ "copy source out of bounds: %d < %d || %d > %d\n",
+ op->source.offset, src->ptr.offset,
+ op->len, src->len);
+
+ if ( op->dest.offset < dest->ptr.offset ||
+ op->dest.offset + op->len > dest->ptr.offset + dest->len )
+ PIN_FAIL(out, GNTST_general_error,
+ "copy dest out of bounds: %d < %d || %d > %d\n",
+ op->dest.offset, dest->ptr.offset,
+ op->len, dest->len);
+
+ memcpy(dest->virt + op->dest.offset, src->virt + op->source.offset,
+ op->len);
+ gnttab_mark_dirty(dest->domain, dest->frame);
+ rc = GNTST_okay;
+ out:
+ return rc;
+}
+
+static int gnttab_copy_one(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *dest,
+ struct gnttab_copy_buf *src)
+{
+ int rc;
+
+ rc = gnttab_copy_lock_domains(op, src, dest);
+ if ( rc < 0 )
+ goto out;
+
+ rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
+ if ( rc < 0 )
+ goto out;
+
+ rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
+ if ( rc < 0 )
+ goto out;
+
+ rc = gnttab_copy_buf(op, dest, src);
+ out:
+ gnttab_copy_release_buf(src);
+ gnttab_copy_release_buf(dest);
+ gnttab_copy_unlock_domains(src, dest);
+ return rc;
+}
+
+static long gnttab_copy(
XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count)
{
- int i;
+ unsigned int i;
struct gnttab_copy op;
+ struct gnttab_copy_buf src = {};
+ struct gnttab_copy_buf dest = {};
for ( i = 0; i < count; i++ )
{
@@ -2217,7 +2308,9 @@ gnttab_copy(
return i;
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
return -EFAULT;
- __gnttab_copy(&op);
+
+ op.status = gnttab_copy_one(&op, &dest, &src);
+
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
return -EFAULT;
guest_handle_add_offset(uop, 1);
--- a/xen/include/public/grant_table.h
+++ b/xen/include/public/grant_table.h
@@ -453,7 +453,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_
struct gnttab_copy {
/* IN parameters. */
- struct {
+ struct gnttab_copy_ptr {
union {
grant_ref_t ref;
xen_pfn_t gmfn;

@@ -1,155 +0,0 @@
# Commit d28f42f2703e483116bafd2b0b76a32af67d83ad
# Date 2015-01-29 14:22:22 +0100
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
grant-table: defer releasing pages acquired in a grant copy
Acquiring a page for the source or destination of a grant copy is an
expensive operation. A common use case is for two adjacent grant copy
ops to operate on either the same source or the same destination page.
Instead of always acquiring and releasing destination and source pages
for each operation, release the page once it is no longer valid for
the next op.
If either the source or destination domains changes both pages are
released as it is unlikely that either will still be valid.
XenServer's performance benchmarks show modest improvements in network
receive throughput (netback uses grant copy in the guest Rx path) and
no regressions in disk performance (using tapdisk3 which grant copies
as the backend).
                            Baseline   Deferred Release
Interhost receive to VM     7.2 Gb/s    ~9 Gb/s
Interhost aggregate          24 Gb/s    28 Gb/s
Intrahost single stream      14 Gb/s    14 Gb/s
Intrahost aggregate          34 Gb/s    36 Gb/s
Aggregate disk write        900 MB/s   900 MB/s
Aggregate disk read         890 MB/s   890 MB/s
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -2236,6 +2236,17 @@ static int gnttab_copy_claim_buf(const s
return rc;
}
+static bool_t gnttab_copy_buf_valid(const struct gnttab_copy_ptr *p,
+ const struct gnttab_copy_buf *b,
+ bool_t has_gref)
+{
+ if ( !b->virt )
+ return 0;
+ if ( has_gref )
+ return b->have_grant && p->u.ref == b->ptr.u.ref;
+ return p->u.gmfn == b->ptr.u.gmfn;
+}
+
static int gnttab_copy_buf(const struct gnttab_copy *op,
struct gnttab_copy_buf *dest,
const struct gnttab_copy_buf *src)
@@ -2274,23 +2285,40 @@ static int gnttab_copy_one(const struct
{
int rc;
- rc = gnttab_copy_lock_domains(op, src, dest);
- if ( rc < 0 )
- goto out;
+ if ( !src->domain || op->source.domid != src->ptr.domid ||
+ !dest->domain || op->dest.domid != dest->ptr.domid )
+ {
+ gnttab_copy_release_buf(src);
+ gnttab_copy_release_buf(dest);
+ gnttab_copy_unlock_domains(src, dest);
- rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
- if ( rc < 0 )
- goto out;
+ rc = gnttab_copy_lock_domains(op, src, dest);
+ if ( rc < 0 )
+ goto out;
+ }
- rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
- if ( rc < 0 )
- goto out;
+ /* Different source? */
+ if ( !gnttab_copy_buf_valid(&op->source, src,
+ op->flags & GNTCOPY_source_gref) )
+ {
+ gnttab_copy_release_buf(src);
+ rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
+ if ( rc < 0 )
+ goto out;
+ }
+
+ /* Different dest? */
+ if ( !gnttab_copy_buf_valid(&op->dest, dest,
+ op->flags & GNTCOPY_dest_gref) )
+ {
+ gnttab_copy_release_buf(dest);
+ rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
+ if ( rc < 0 )
+ goto out;
+ }
rc = gnttab_copy_buf(op, dest, src);
out:
- gnttab_copy_release_buf(src);
- gnttab_copy_release_buf(dest);
- gnttab_copy_unlock_domains(src, dest);
return rc;
}
@@ -2301,21 +2329,42 @@ static long gnttab_copy(
struct gnttab_copy op;
struct gnttab_copy_buf src = {};
struct gnttab_copy_buf dest = {};
+ long rc = 0;
for ( i = 0; i < count; i++ )
{
- if (i && hypercall_preempt_check())
- return i;
+ if ( i && hypercall_preempt_check() )
+ {
+ rc = i;
+ break;
+ }
+
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
- return -EFAULT;
+ {
+ rc = -EFAULT;
+ break;
+ }
op.status = gnttab_copy_one(&op, &dest, &src);
+ if ( op.status != GNTST_okay )
+ {
+ gnttab_copy_release_buf(&src);
+ gnttab_copy_release_buf(&dest);
+ }
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
- return -EFAULT;
+ {
+ rc = -EFAULT;
+ break;
+ }
guest_handle_add_offset(uop, 1);
}
- return 0;
+
+ gnttab_copy_release_buf(&src);
+ gnttab_copy_release_buf(&dest);
+ gnttab_copy_unlock_domains(&src, &dest);
+
+ return rc;
}
static long
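
The optimisation described above is essentially a one-entry cache: keep the most recently acquired source and destination buffers mapped, and only release and re-acquire when the next op names a different page or a different domain. Stripped of the grant-table specifics, the control flow is roughly the sketch below; acquire_mapping()/release_mapping() are hypothetical stand-ins for the expensive __acquire_grant_for_copy()/map_domain_page() path, not real Xen functions.

    /* Hypothetical helpers standing in for the expensive acquire/release path. */
    extern void *acquire_mapping(unsigned int ref);
    extern void release_mapping(void *mapping);

    /* One-entry cache of an acquired buffer, keyed by the reference it maps. */
    struct cached_buf {
        int valid;
        unsigned int ref;
        void *mapping;
    };

    static void *get_buf(struct cached_buf *c, unsigned int ref)
    {
        if ( c->valid && c->ref == ref )
            return c->mapping;              /* adjacent op hit the same page */

        if ( c->valid )
        {
            release_mapping(c->mapping);    /* release deferred until now */
            c->valid = 0;
        }

        c->mapping = acquire_mapping(ref);
        if ( c->mapping )
        {
            c->ref = ref;
            c->valid = 1;
        }
        return c->mapping;
    }

An error on an individual op still forces a release (the op.status != GNTST_okay hunk in gnttab_copy() above), so a failed copy never leaves a stale mapping cached.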

@@ -1,90 +0,0 @@
References: bsc#949138
Subject: libxl: make some _dispose functions idempotent and tolerate NULL
From: Wei Liu wei.liu2@citrix.com Wed Feb 25 14:56:02 2015 +0000
Date: Mon Mar 2 17:05:35 2015 +0000:
Git: 1ea68f1a82ef94b3cc644fa70307c5151f356baf
These functions are not generated, so we need to do it by hand.
Functions list:
libxl_bitmap_dispose
libxl_string_list_dispose
libxl_key_value_list_dispose
libxl_cpuid_dispose
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.5.1-testing/tools/libxl/libxl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
+++ xen-4.5.1-testing/tools/libxl/libxl.c
@@ -211,9 +211,12 @@ void libxl_string_list_dispose(libxl_str
if (!sl)
return;
- for (i = 0; sl[i] != NULL; i++)
+ for (i = 0; sl[i] != NULL; i++) {
free(sl[i]);
+ sl[i] = NULL;
+ }
free(sl);
+ *psl = NULL;
}
void libxl_string_list_copy(libxl_ctx *ctx,
@@ -273,10 +276,14 @@ void libxl_key_value_list_dispose(libxl_
for (i = 0; kvl[i] != NULL; i += 2) {
free(kvl[i]);
- if (kvl[i + 1])
+ kvl[i] = NULL;
+ if (kvl[i + 1]) {
free(kvl[i + 1]);
+ kvl[i+1] = NULL;
+ }
}
free(kvl);
+ *pkvl = NULL;
}
void libxl_key_value_list_copy(libxl_ctx *ctx,
Index: xen-4.5.1-testing/tools/libxl/libxl_cpuid.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_cpuid.c
+++ xen-4.5.1-testing/tools/libxl/libxl_cpuid.c
@@ -28,10 +28,13 @@ void libxl_cpuid_dispose(libxl_cpuid_pol
return;
for (i = 0; cpuid_list[i].input[0] != XEN_CPUID_INPUT_UNUSED; i++) {
for (j = 0; j < 4; j++)
- if (cpuid_list[i].policy[j] != NULL)
+ if (cpuid_list[i].policy[j] != NULL) {
free(cpuid_list[i].policy[j]);
+ cpuid_list[i].policy[j] = NULL;
+ }
}
free(cpuid_list);
+ *p_cpuid_list = NULL;
return;
}
Index: xen-4.5.1-testing/tools/libxl/libxl_utils.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_utils.c
+++ xen-4.5.1-testing/tools/libxl/libxl_utils.c
@@ -604,7 +604,12 @@ void libxl_bitmap_init(libxl_bitmap *map
void libxl_bitmap_dispose(libxl_bitmap *map)
{
+ if (!map)
+ return;
+
free(map->map);
+ map->map = NULL;
+ map->size = 0;
}
void libxl_bitmap_copy(libxl_ctx *ctx, libxl_bitmap *dptr,
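
The pattern applied to all four functions is the same: tolerate a NULL argument, free what is owned, and then reset the freed pointers and sizes so a second dispose call is a no-op. A generic sketch of that idiom, using an invented struct rather than a libxl type:

    #include <stdlib.h>

    struct buffer {
        void *data;
        size_t size;
    };

    /* Idempotent, NULL-tolerant dispose: safe to call zero, one or many times. */
    static void buffer_dispose(struct buffer *buf)
    {
        if (!buf)
            return;
        free(buf->data);     /* free(NULL) is already a no-op */
        buf->data = NULL;    /* a repeated dispose now does nothing */
        buf->size = 0;
    }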

@@ -1,113 +0,0 @@
# Commit 88a2372c6ba44dd42b915a95a823cf9d4d260e25
# Date 2015-03-23 16:49:42 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
vm-assist: prepare for discontiguous used bit numbers
Since the new flag will get assigned a value discontiguous to the
existing ones (in order to preserve the low bits, as only those are
currently accessible to 32-bit guests), this requires a little bit of
rework of the VM assist code in general: An architecture specific
VM_ASSIST_VALID definition gets introduced (with an optional compat
mode counterpart), and compilation of the respective code becomes
conditional upon this being defined (ARM doesn't wire these up and
hence doesn't need that code).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
--- a/xen/common/compat/kernel.c
+++ b/xen/common/compat/kernel.c
@@ -41,6 +41,11 @@ CHECK_TYPE(domain_handle);
#define xennmi_callback compat_nmi_callback
#define xennmi_callback_t compat_nmi_callback_t
+#ifdef COMPAT_VM_ASSIST_VALID
+#undef VM_ASSIST_VALID
+#define VM_ASSIST_VALID COMPAT_VM_ASSIST_VALID
+#endif
+
#define DO(fn) int compat_##fn
#define COMPAT
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1325,9 +1325,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN
return rc;
}
-long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
+#ifdef VM_ASSIST_VALID
+long vm_assist(struct domain *p, unsigned int cmd, unsigned int type,
+ unsigned long valid)
{
- if ( type > MAX_VMASST_TYPE )
+ if ( type >= BITS_PER_LONG || !test_bit(type, &valid) )
return -EINVAL;
switch ( cmd )
@@ -1342,6 +1344,7 @@ long vm_assist(struct domain *p, unsigne
return -ENOSYS;
}
+#endif
struct pirq *pirq_get_info(struct domain *d, int pirq)
{
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -396,10 +396,12 @@ DO(nmi_op)(unsigned int cmd, XEN_GUEST_H
return rc;
}
+#ifdef VM_ASSIST_VALID
DO(vm_assist)(unsigned int cmd, unsigned int type)
{
- return vm_assist(current->domain, cmd, type);
+ return vm_assist(current->domain, cmd, type, VM_ASSIST_VALID);
}
+#endif
DO(ni_hypercall)(void)
{
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -327,6 +327,14 @@ extern unsigned long xen_phys_start;
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
+#define NATIVE_VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments) | \
+ (1UL << VMASST_TYPE_4gb_segments_notify) | \
+ (1UL << VMASST_TYPE_writable_pagetables) | \
+ (1UL << VMASST_TYPE_pae_extended_cr3))
+#define VM_ASSIST_VALID NATIVE_VM_ASSIST_VALID
+#define COMPAT_VM_ASSIST_VALID (NATIVE_VM_ASSIST_VALID & \
+ ((1UL << COMPAT_BITS_PER_LONG) - 1))
+
#define ELFSIZE 64
#define ARCH_CRASH_SAVE_VMCOREINFO
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -486,7 +486,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
#define MAX_VMASST_TYPE 3
+#endif
#ifndef __ASSEMBLY__
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -92,7 +92,8 @@ extern void guest_printk(const struct do
__attribute__ ((format (printf, 2, 3)));
extern void noreturn panic(const char *format, ...)
__attribute__ ((format (printf, 1, 2)));
-extern long vm_assist(struct domain *, unsigned int, unsigned int);
+extern long vm_assist(struct domain *, unsigned int cmd, unsigned int type,
+ unsigned long valid);
extern int __printk_ratelimit(int ratelimit_ms, int ratelimit_burst);
extern int printk_ratelimit(void);
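
The core of the change is replacing the contiguous "type > MAX_VMASST_TYPE" range check with a per-architecture validity bitmask, so assist numbers no longer have to be consecutive. A standalone model of that check is sketched below (assuming a 64-bit unsigned long); the ASSIST_* numbers and the mask are invented for illustration and are not the real VMASST_TYPE_* values.

    #include <errno.h>
    #include <stdio.h>

    #define BITS_PER_LONG  (8 * sizeof(unsigned long))

    /* Invented assist numbers: two low ones plus a deliberately sparse one,
     * mimicking a flag landing at bit 32 (requires a 64-bit unsigned long). */
    #define ASSIST_LOW0     0
    #define ASSIST_LOW1     1
    #define ASSIST_SPARSE  32

    static const unsigned long assist_valid =
        (1UL << ASSIST_LOW0) | (1UL << ASSIST_LOW1) | (1UL << ASSIST_SPARSE);

    static int vm_assist_check(unsigned int type, unsigned long valid)
    {
        /* Works for discontiguous bit numbers, unlike a plain range check. */
        if ( type >= BITS_PER_LONG || !(valid & (1UL << type)) )
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               vm_assist_check(ASSIST_LOW1, assist_valid),    /* 0       */
               vm_assist_check(2, assist_valid),              /* -EINVAL */
               vm_assist_check(ASSIST_SPARSE, assist_valid)); /* 0       */
        return 0;
    }

COMPAT_VM_ASSIST_VALID in the hunk above does the analogous thing for 32-bit guests by truncating the mask to the low COMPAT_BITS_PER_LONG bits.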

@@ -1,609 +0,0 @@
Index: xen-4.5.1-testing/tools/libxl/libxl_dm.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dm.c
+++ xen-4.5.1-testing/tools/libxl/libxl_dm.c
@@ -445,6 +445,15 @@ static char ** libxl__build_device_model
flexarray_append(dm_args, "-mon");
flexarray_append(dm_args, "chardev=libxl-cmd,mode=control");
+ flexarray_append(dm_args, "-chardev");
+ flexarray_append(dm_args,
+ libxl__sprintf(gc, "socket,id=libxenstat-cmd,"
+ "path=%s/qmp-libxenstat-%d,server,nowait",
+ libxl__run_dir_path(), guest_domid));
+
+ flexarray_append(dm_args, "-mon");
+ flexarray_append(dm_args, "chardev=libxenstat-cmd,mode=control");
+
for (i = 0; i < guest_config->num_channels; i++) {
connection = guest_config->channels[i].connection;
devid = guest_config->channels[i].devid;
Index: xen-4.5.1-testing/tools/libxl/libxl_qmp.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_qmp.c
+++ xen-4.5.1-testing/tools/libxl/libxl_qmp.c
@@ -723,6 +723,13 @@ void libxl__qmp_cleanup(libxl__gc *gc, u
LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
}
}
+
+ qmp_socket = GCSPRINTF("%s/qmp-libxenstat-%d", libxl__run_dir_path(), domid);
+ if (unlink(qmp_socket) == -1) {
+ if (errno != ENOENT) {
+ LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
+ }
+ }
}
int libxl__qmp_query_serial(libxl__qmp_handler *qmp)
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/Makefile
===================================================================
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/Makefile
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/Makefile
@@ -24,7 +24,7 @@ MINOR=0
LIB=src/libxenstat.a
SHLIB=src/libxenstat.so.$(MAJOR).$(MINOR)
SHLIB_LINKS=src/libxenstat.so.$(MAJOR) src/libxenstat.so
-OBJECTS-y=src/xenstat.o
+OBJECTS-y=src/xenstat.o src/xenstat_qmp.o
OBJECTS-$(CONFIG_Linux) += src/xenstat_linux.o
OBJECTS-$(CONFIG_SunOS) += src/xenstat_solaris.o
OBJECTS-$(CONFIG_NetBSD) += src/xenstat_netbsd.o
@@ -32,7 +32,7 @@ OBJECTS-$(CONFIG_FreeBSD) += src/xenstat
SONAME_FLAGS=-Wl,$(SONAME_LDFLAG) -Wl,libxenstat.so.$(MAJOR)
CFLAGS+=-fPIC
-CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude)
+CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude) -include $(XEN_ROOT)/tools/config.h
LDLIBS-y = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl)
LDLIBS-$(CONFIG_SunOS) += -lkstat
Index: xen-4.5.1-testing/tools/xenstat/xentop/Makefile
===================================================================
--- xen-4.5.1-testing.orig/tools/xenstat/xentop/Makefile
+++ xen-4.5.1-testing/tools/xenstat/xentop/Makefile
@@ -19,7 +19,7 @@ all install xentop:
else
CFLAGS += -DGCC_PRINTF -Werror $(CFLAGS_libxenstat)
-LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(SOCKET_LIBS) -lm
+LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(SOCKET_LIBS) -lm -lyajl
CFLAGS += -DHOST_$(XEN_OS)
# Include configure output (config.h) to headers search path
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_priv.h
===================================================================
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat_priv.h
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_priv.h
@@ -109,5 +109,7 @@ extern int xenstat_collect_networks(xens
extern void xenstat_uninit_networks(xenstat_handle * handle);
extern int xenstat_collect_vbds(xenstat_node * node);
extern void xenstat_uninit_vbds(xenstat_handle * handle);
+extern void read_attributes_qdisk(xenstat_node * node);
+extern xenstat_vbd *xenstat_save_vbd(xenstat_domain * domain, xenstat_vbd * vbd);
#endif /* XENSTAT_PRIV_H */
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat.c
===================================================================
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat.c
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat.c
@@ -657,6 +657,27 @@ static void xenstat_uninit_xen_version(x
* VBD functions
*/
+/* Save VBD information */
+xenstat_vbd *xenstat_save_vbd(xenstat_domain *domain, xenstat_vbd *vbd)
+{
+ xenstat_vbd *vbds = domain->vbds;
+
+ domain->num_vbds++;
+ domain->vbds = realloc(domain->vbds,
+ domain->num_vbds *
+ sizeof(xenstat_vbd));
+
+ if (domain->vbds == NULL) {
+ domain->num_vbds = 0;
+ free(vbds);
+ }
+ else {
+ domain->vbds[domain->num_vbds - 1] = *vbd;
+ }
+
+ return domain->vbds;
+}
+
/* Free VBD information */
static void xenstat_free_vbds(xenstat_node * node)
{
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_linux.c
===================================================================
--- xen-4.5.1-testing.orig/tools/xenstat/libxenstat/src/xenstat_linux.c
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_linux.c
@@ -417,6 +417,9 @@ int xenstat_collect_vbds(xenstat_node *
}
}
+ /* Get qdisk statistics */
+ read_attributes_qdisk(node);
+
rewinddir(priv->sysfsvbd);
for(dp = readdir(priv->sysfsvbd); dp != NULL ;
@@ -477,18 +480,10 @@ int xenstat_collect_vbds(xenstat_node *
continue;
}
- if (domain->vbds == NULL) {
- domain->num_vbds = 1;
- domain->vbds = malloc(sizeof(xenstat_vbd));
- } else {
- domain->num_vbds++;
- domain->vbds = realloc(domain->vbds,
- domain->num_vbds *
- sizeof(xenstat_vbd));
- }
- if (domain->vbds == NULL)
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL) {
+ perror("Allocation error");
return 0;
- domain->vbds[domain->num_vbds - 1] = vbd;
+ }
}
return 1;
Index: xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_qmp.c
===================================================================
--- /dev/null
+++ xen-4.5.1-testing/tools/xenstat/libxenstat/src/xenstat_qmp.c
@@ -0,0 +1,451 @@
+/* libxenstat: statistics-collection library for Xen
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <sys/un.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+
+#include "xenstat_priv.h"
+
+#ifdef HAVE_YAJL_YAJL_VERSION_H
+# include <yajl/yajl_version.h>
+#endif
+
+/* YAJL version check */
+#if defined(YAJL_MAJOR) && (YAJL_MAJOR > 1)
+# define HAVE_YAJL_V2 1
+#endif
+
+#ifdef HAVE_YAJL_V2
+
+#include <yajl/yajl_tree.h>
+
+static unsigned char *qmp_query(int, char *);
+
+enum query_blockstats {
+ QMP_STATS_RETURN = 0,
+ QMP_STATS_DEVICE = 1,
+ QMP_STATS = 2,
+ QMP_RD_BYTES = 3,
+ QMP_WR_BYTES = 4,
+ QMP_RD_OPERATIONS = 5,
+ QMP_WR_OPERATIONS = 6,
+};
+
+enum query_block {
+ QMP_BLOCK_RETURN = 0,
+ QMP_BLOCK_DEVICE = 1,
+ QMP_INSERTED = 2,
+ QMP_FILE = 3,
+};
+
+
+/* Given the qmp device name, get the image filename associated with it
+ QMP Syntax for querying block infomation:
+ In: { "execute": "query-block" }
+ Out: {"return": [{
+ "device": 'str, "locked": 'bool', "removable": bool,
+ "inserted": {
+ "iops_rd": 'int',
+ "image": {
+ "virtual-size": 'int', "filename": 'str', "cluster-size": 'int',
+ "format": 'str', "actual-size": 'int', "dirty-flag": 'bool'
+ },
+ "iops_wr": 'int', "ro": 'bool', "backing_file_depth": 'int',
+ "drv": 'str', "iops": 'int', "bps_wr": 'int', "encrypted": 'bool',
+ "bps": 'int', "bps_rd": 'int',
+ "file": 'str', "encryption_key_missing": 'bool'
+ },
+ "type": 'str'
+ }]}
+*/
+static char *qmp_get_block_image(xenstat_node *node, char *qmp_devname, int qfd)
+{
+ char *tmp, *file = NULL;
+ char *query_block_cmd = "{ \"execute\": \"query-block\" }";
+ static const char *const qblock[] = {
+ [ QMP_BLOCK_RETURN ] = "return",
+ [ QMP_BLOCK_DEVICE ] = "device",
+ [ QMP_INSERTED ] = "inserted",
+ [ QMP_FILE ] = "file",
+ };
+ const char *ptr[] = {0, 0};
+ unsigned char *qmp_stats;
+ yajl_val info, ret_obj, dev_obj, n;
+ int i;
+
+ if ((qmp_stats = qmp_query(qfd, query_block_cmd)) == NULL)
+ return NULL;
+
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature with bug fixes */
+ if ((info = yajl_tree_parse((char *)qmp_stats, NULL, 0)) == NULL) {
+ free(qmp_stats);
+ return NULL;
+ }
+
+ ptr[0] = qblock[QMP_BLOCK_RETURN]; /* "return" */
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
+ goto done;
+
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
+ n = YAJL_GET_ARRAY(ret_obj)->values[i];
+
+ ptr[0] = qblock[QMP_BLOCK_DEVICE]; /* "device" */
+ if ((dev_obj = yajl_tree_get(n, ptr, yajl_t_any)) != NULL) {
+ tmp = YAJL_GET_STRING(dev_obj);
+ if (strcmp(qmp_devname, tmp))
+ continue;
+ }
+ else
+ continue;
+
+ ptr[0] = qblock[QMP_INSERTED]; /* "inserted" */
+ n = yajl_tree_get(n, ptr, yajl_t_any);
+ if (n) {
+ ptr[0] = qblock[QMP_FILE]; /* "file" */
+ n = yajl_tree_get(n, ptr, yajl_t_any);
+ if (n && YAJL_IS_STRING(n)) {
+ tmp = YAJL_GET_STRING(n);
+ file = malloc(strlen(tmp)+1);
+ if (file != NULL)
+ strcpy(file, tmp);
+ goto done;
+ }
+ }
+ }
+done:
+ yajl_tree_free(info);
+ return file;
+}
+
+
+/* Given a QMP device name, lookup the associated xenstore qdisk device id */
+static void lookup_xenstore_devid(xenstat_node * node, unsigned int domid, char *qmp_devname,
+ int qfd, unsigned int *dev, unsigned int *sector_size)
+{
+ char **dev_ids, *tmp, *ptr, *image, path[80];
+ unsigned int num_dev_ids;
+ int i, devid;
+
+ /* Get all the qdisk dev IDs associated with the this VM */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", domid);
+ dev_ids = xs_directory(node->handle->xshandle, XBT_NULL, path, &num_dev_ids);
+ if (dev_ids == NULL) {
+ return;
+ }
+
+ /* Get the filename of the image associated with this QMP device */
+ image = qmp_get_block_image(node, qmp_devname, qfd);
+ if (image == NULL) {
+ free(dev_ids);
+ return;
+ }
+
+ /* Look for a matching image in xenstore */
+ for (i=0; i<num_dev_ids; i++) {
+ devid = atoi(dev_ids[i]);
+ /* Get the xenstore name of the image */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/params", domid, devid);
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
+ continue;
+
+ /* Get to actual path in string */
+ if ((tmp = strchr(ptr, '/')) == NULL)
+ tmp = ptr;
+ if (!strcmp(tmp,image)) {
+ *dev = devid;
+ free(ptr);
+
+ /* Get the xenstore sector size of the image while we're here */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/sector-size", domid, devid);
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) != NULL) {
+ *sector_size = atoi((char *)ptr);
+ free(ptr);
+ }
+ break;
+ }
+ free(ptr);
+ }
+
+ free(image);
+ free(dev_ids);
+}
+
+/* Parse the stats buffer which contains I/O data for all the disks belonging to domid */
+static void qmp_parse_stats(xenstat_node *node, unsigned int domid, unsigned char *stats_buf, int qfd)
+{
+ char *qmp_devname;
+ static const char *const qstats[] = {
+ [ QMP_STATS_RETURN ] = "return",
+ [ QMP_STATS_DEVICE ] = "device",
+ [ QMP_STATS ] = "stats",
+ [ QMP_RD_BYTES ] = "rd_bytes",
+ [ QMP_WR_BYTES ] = "wr_bytes",
+ [ QMP_RD_OPERATIONS ] = "rd_operations",
+ [ QMP_WR_OPERATIONS ] = "wr_operations",
+ };
+ const char *ptr[] = {0, 0};
+ yajl_val info, ret_obj, stats_obj, n;
+ xenstat_vbd vbd;
+ xenstat_domain *domain;
+ unsigned int sector_size = 512;
+ int i, j;
+
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature */
+ if ((info = yajl_tree_parse((char *)stats_buf, NULL, 0)) == NULL)
+ return;
+
+ ptr[0] = qstats[QMP_STATS_RETURN]; /* "return" */
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
+ goto done;
+
+ /* Array of devices */
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
+ memset(&vbd, 0, sizeof(xenstat_vbd));
+ qmp_devname = NULL;
+ stats_obj = YAJL_GET_ARRAY(ret_obj)->values[i];
+
+ ptr[0] = qstats[QMP_STATS_DEVICE]; /* "device" */
+ if ((n = yajl_tree_get(stats_obj, ptr, yajl_t_any)) != NULL)
+ qmp_devname = YAJL_GET_STRING(n);
+
+ ptr[0] = qstats[QMP_STATS]; /* "stats" */
+ stats_obj = yajl_tree_get(stats_obj, ptr, yajl_t_object);
+ if (stats_obj && YAJL_IS_OBJECT(stats_obj)) {
+ for (j=3; j<7; j++) {
+ ptr[0] = qstats[j];
+ n = yajl_tree_get(stats_obj, ptr, yajl_t_number);
+ if (n && YAJL_IS_NUMBER(n)) {
+ switch(j) {
+ case QMP_RD_BYTES: /* "rd_bytes" */
+ vbd.rd_sects = YAJL_GET_INTEGER(n) / sector_size;
+ break;
+ case QMP_WR_BYTES: /* "wr_bytes" */
+ vbd.wr_sects = YAJL_GET_INTEGER(n) / sector_size;
+ break;
+ case QMP_RD_OPERATIONS: /* "rd_operations" */
+ vbd.rd_reqs = YAJL_GET_INTEGER(n);
+ break;
+ case QMP_WR_OPERATIONS: /* "wr_operations" */
+ vbd.wr_reqs = YAJL_GET_INTEGER(n);
+ break;
+ }
+ }
+ }
+ /* With the QMP device name, lookup the xenstore qdisk device ID and set vdb.dev */
+ if (qmp_devname)
+ lookup_xenstore_devid(node, domid, qmp_devname, qfd, &vbd.dev, &sector_size);
+ if ((domain = xenstat_node_domain(node, domid)) == NULL)
+ continue;
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL)
+ goto done;
+ }
+ }
+done:
+ yajl_tree_free(info);
+}
+
+/* Write a command via the QMP. Returns number of bytes written */
+static size_t qmp_write(int qfd, char *cmd, size_t cmd_len)
+{
+ size_t pos = 0;
+ ssize_t res;
+
+ while (cmd_len > pos) {
+ res = write(qfd, cmd + pos, cmd_len - pos);
+ switch (res) {
+ case -1:
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ return 0;
+ case 0:
+ errno = EPIPE;
+ return pos;
+ default:
+ pos += (size_t)res;
+ }
+ }
+ return pos;
+}
+
+/* Read the data sent in response to a QMP execute query. Returns 1 for success */
+static int qmp_read(int qfd, unsigned char **qstats)
+{
+ unsigned char buf[1024], *ptr;
+ struct pollfd pfd[2];
+ int n, qsize = 0;
+
+ *qstats = NULL;
+ pfd[0].fd = qfd;
+ pfd[0].events = POLLIN;
+ while ((n = poll(pfd, POLLIN, 10)) > 0) {
+ if (pfd[0].revents & POLLIN) {
+ if ((n = read(qfd, buf, sizeof(buf))) < 0) {
+ free(*qstats);
+ return 0;
+ }
+ ptr = realloc(*qstats, qsize+n+1);
+ if (ptr == NULL) {
+ free(*qstats);
+ return 0;
+ }
+ memcpy(&ptr[qsize], buf, n);
+ qsize += n;
+ ptr[qsize] = 0;
+ *qstats = ptr;
+ }
+ }
+ return 1;
+}
+
+/* With the given cmd, query QMP for requested data. Returns allocated buffer containing data or NULL */
+static unsigned char *qmp_query(int qfd, char *cmd)
+{
+ unsigned char *qstats = NULL;
+ int n;
+
+ n = strlen(cmd);
+ if (qmp_write(qfd, cmd, n) != n)
+ return NULL;
+ if (!qmp_read(qfd, &qstats))
+ return NULL;
+ return qstats;
+}
+
+/* Returns a socket connected to the QMP socket. Returns -1 on failure. */
+static int qmp_connect(char *path)
+{
+ struct sockaddr_un sun;
+ int s;
+
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+ return -1;
+ (void)fcntl(s, F_SETFD, 1);
+
+ memset(&sun, 0, sizeof(struct sockaddr_un));
+ sun.sun_family = AF_UNIX;
+
+ if (strlen(path) >= sizeof(sun.sun_path)) {
+ close(s);
+ return -1;
+ }
+
+ strcpy(sun.sun_path, path);
+ if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) < 0) {
+ close(s);
+ return -1;
+ }
+
+ return s;
+}
+
+/* Get up to 1024 active domains */
+static xc_domaininfo_t *get_domain_ids(int *num_doms)
+{
+ xc_domaininfo_t *dominfo;
+ xc_interface *xc_handle;
+
+ dominfo = calloc(1024, sizeof(xc_domaininfo_t));
+ if (dominfo == NULL)
+ return NULL;
+ xc_handle = xc_interface_open(0,0,0);
+ *num_doms = xc_domain_getinfolist(xc_handle, 0, 1024, dominfo);
+ xc_interface_close(xc_handle);
+ return dominfo;
+}
+
+/* Gather the qdisk statistics by querying QMP
+ Resources: http://wiki.qemu.org/QMP and qmp-commands.hx from the qemu code
+ QMP Syntax for entering command mode. This command must be issued before
+ issuing any other command:
+ In: {"execute": "qmp_capabilities"}
+ Out: {"return": {}}
+ QMP Syntax for querying block statistics:
+ In: { "execute": "query-blockstats" }
+ Out: {"return": [{
+ "device": 'str',
+ "parent": {
+ "stats": {
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
+ }
+ },
+ "stats": {
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
+ }
+ }]}
+*/
+void read_attributes_qdisk(xenstat_node * node)
+{
+ char *cmd_mode = "{ \"execute\": \"qmp_capabilities\" }";
+ char *query_blockstats_cmd = "{ \"execute\": \"query-blockstats\" }";
+ xc_domaininfo_t *dominfo = NULL;
+ unsigned char *qmp_stats, *val;
+ char path[80];
+ int i, qfd, num_doms;
+
+ dominfo = get_domain_ids(&num_doms);
+ if (dominfo == NULL)
+ return;
+
+ for (i=0; i<num_doms; i++) {
+ if (dominfo[i].domain <= 0)
+ continue;
+
+ /* Verify that qdisk disks are used with this VM */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", dominfo[i].domain);
+ if ((val = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
+ continue;
+ free(val);
+
+ /* Connect to this VMs QMP socket */
+ snprintf(path, sizeof(path), "/var/run/xen/qmp-libxenstat-%i", dominfo[i].domain);
+ if ((qfd = qmp_connect(path)) < 0) {
+ continue;
+ }
+
+ /* First enable QMP capabilities so that we can query for data */
+ if ((qmp_stats = qmp_query(qfd, cmd_mode)) != NULL) {
+ free(qmp_stats);
+ /* Query QMP for this VMs blockstats */
+ if ((qmp_stats = qmp_query(qfd, query_blockstats_cmd)) != NULL) {
+ qmp_parse_stats(node, dominfo[i].domain, qmp_stats, qfd);
+ free(qmp_stats);
+ }
+ }
+ close(qfd);
+ }
+
+ free(dominfo);
+}
+
+#else /* !HAVE_YAJL_V2 */
+
+/* Statistics gathering for qdisks requires at least yajl v2 */
+void read_attributes_qdisk(xenstat_node * node)
+{
+}
+
+#endif /* !HAVE_YAJL_V2 */

@@ -1,24 +0,0 @@
# Commit e59abf8c8c9c1d99a531292c6a548d6dfd0ceacc
# Date 2015-04-14 14:59:53 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/traps: identify the vcpu in context when dumping registers
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -53,9 +53,11 @@ static void _show_registers(
printk("\nRFLAGS: %016lx ", regs->rflags);
if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask));
- printk("CONTEXT: %s\n", context_names[context]);
+ printk("CONTEXT: %s", context_names[context]);
+ if ( v && !is_idle_vcpu(v) )
+ printk(" (%pv)", v);
- printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
+ printk("\nrax: %016lx rbx: %016lx rcx: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
regs->rdx, regs->rsi, regs->rdi);

@@ -1,41 +0,0 @@
# Commit 63dcef9fe5b880007075b5eb53f9950a826519ce
# Date 2015-04-14 15:02:10 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvm: don't include asm/spinlock.h
asm/spinlock.h should not be included directly.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -52,7 +52,6 @@
#include <asm/xstate.h>
#include <asm/traps.h>
#include <asm/mc146818rtc.h>
-#include <asm/spinlock.h>
#include <asm/mce.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -41,7 +41,6 @@
#include <asm/msr.h>
#include <asm/i387.h>
#include <asm/iocap.h>
-#include <asm/spinlock.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -35,7 +35,6 @@
#include <asm/types.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
-#include <asm/spinlock.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/mem_sharing.h>

@@ -1,22 +0,0 @@
# Commit f70df9ec1ab72b6bbebad72d81109c1b214007e1
# Date 2015-04-14 15:02:32 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mtrr: include asm/atomic.h
asm/atomic.h is needed but only included indirectly via
asm/spinlock.h.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/cpu/mtrr/main.c
+++ b/xen/arch/x86/cpu/mtrr/main.c
@@ -36,6 +36,7 @@
#include <xen/lib.h>
#include <xen/smp.h>
#include <xen/spinlock.h>
+#include <asm/atomic.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/processor.h>

@@ -1,46 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 70a3cbb8c9cb17a61fa25c48ba3d7b44fd059c90
# Date 2015-04-14 16:50:35 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: honor all mask requests
Commit 74fd0036de ("x86: properly handle MSI-X unmask operation from
guests") didn't go far enough: it fixed an issue with unmasking, but
left an issue with masking in place: Due to the (late) point in time
when qemu requests the hypervisor to set up MSI-X interrupts (which is
where the MMIO intercept gets put in place), the hypervisor doesn't
see all guest writes, and hence shouldn't make assumptions on the state
the virtual MSI-X resources are in. Bypassing the rest of the logic on
a guest mask operation leads to
[00:04.0] pci_msix_write: Error: Can't update msix entry 1 since MSI-X is already enabled.
which surprisingly enough doesn't lead to the device not working
anymore (I didn't dig in deep enough to figure out why that is). But it
does prevent the IRQ from being migrated inside the guest, i.e. all
interrupts will always arrive in vCPU 0.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -286,11 +286,11 @@ static int msixtbl_write(struct vcpu *v,
goto out;
}
- /* exit to device model if address/data has been modified */
- if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+ /* Exit to device model when unmasking and address/data got modified. */
+ if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
+ test_and_clear_bit(nr_entry, &entry->table_flags) )
{
- if ( !(val & PCI_MSIX_VECTOR_BITMASK) )
- v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
+ v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
goto out;
}

@@ -1,58 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit df9f5676b3711c95127d44e871ad7ca38d6ed28a
# Date 2015-04-14 16:51:18 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: add valid bits for read acceleration
Again because Xen doesn't get to see all guest writes, it shouldn't
serve reads from its cache before having seen a write to the respective
address.
Also use DECLARE_BITMAP() in a related field declaration instead of
open coding it.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -154,11 +154,14 @@ struct msixtbl_entry
struct pci_dev *pdev;
unsigned long gtable; /* gpa of msix table */
unsigned long table_len;
- unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
+ DECLARE_BITMAP(table_flags, MAX_MSIX_TABLE_ENTRIES);
#define MAX_MSIX_ACC_ENTRIES 3
struct {
uint32_t msi_ad[3]; /* Shadow of address low, high and data */
} gentries[MAX_MSIX_ACC_ENTRIES];
+ DECLARE_BITMAP(acc_valid, 3 * MAX_MSIX_ACC_ENTRIES);
+#define acc_bit(what, ent, slot, idx) \
+ what##_bit((slot) * 3 + (idx), (ent)->acc_valid)
struct rcu_head rcu;
};
@@ -233,9 +236,10 @@ static int msixtbl_read(
if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
- if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
- goto out;
index = offset / sizeof(uint32_t);
+ if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ||
+ !acc_bit(test, entry, nr_entry, index) )
+ goto out;
*pval = entry->gentries[nr_entry].msi_ad[index];
}
else
@@ -281,6 +285,7 @@ static int msixtbl_write(struct vcpu *v,
{
index = offset / sizeof(uint32_t);
entry->gentries[nr_entry].msi_ad[index] = val;
+ acc_bit(set, entry, nr_entry, index);
}
set_bit(nr_entry, &entry->table_flags);
goto out;
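
DECLARE_BITMAP(), mentioned in the description, is simply the standard way to size an array of unsigned longs for a given number of bits instead of open-coding the BITS_TO_LONGS() arithmetic, as the old table_flags declaration did. A minimal self-contained version of the idiom, with generic names rather than the vMSI-X structure:

    #include <limits.h>

    #define BITS_PER_LONG        (CHAR_BIT * sizeof(unsigned long))
    #define BITS_TO_LONGS(bits)  (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
    #define DECLARE_BITMAP(name, bits)  unsigned long name[BITS_TO_LONGS(bits)]

    #define MAX_ACC_ENTRIES 3

    struct acc_example {
        /* One "this cached word is valid" bit per 32-bit word, three words
         * per accelerated entry, mirroring the acc_valid bitmap added above. */
        DECLARE_BITMAP(acc_valid, 3 * MAX_ACC_ENTRIES);
    };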

@@ -1,27 +0,0 @@
References: bsc#945164
Subject: libxl: use DEBUG log level instead of INFO
From: Wei Liu wei.liu2@citrix.com Fri Apr 17 12:31:29 2015 +0100
Date: Wed Apr 22 14:40:40 2015 +0100:
Git: ddc17f311099c1f0f37a771a2f5f904d848102f7
Make libxl less noisy when destroying a domain.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.5.1-testing/tools/libxl/libxl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
+++ xen-4.5.1-testing/tools/libxl/libxl.c
@@ -1695,7 +1695,7 @@ static void devices_destroy_cb(libxl__eg
_exit(-1);
}
}
- LOG(INFO, "forked pid %ld for destroy of domain %d", (long)rc, domid);
+ LOG(DEBUG, "forked pid %ld for destroy of domain %d", (long)rc, domid);
return;

@@ -1,33 +0,0 @@
# Commit 017e667c433a1040306db6265b05e104568c70c8
# Date 2015-05-05 18:00:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
domctl: don't truncate XEN_DOMCTL_max_mem requests
Instead saturate the value if the input can't be represented in the
respective struct domain field.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -943,7 +943,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
case XEN_DOMCTL_max_mem:
{
- unsigned long new_max;
+ uint64_t new_max;
ret = -EINVAL;
new_max = op->u.max_mem.max_memkb >> (PAGE_SHIFT-10);
@@ -954,7 +954,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
* that the domain will now be allowed to "ratchet" down to new_max. In
* the meantime, while tot > max, all new allocations are disallowed.
*/
- d->max_pages = new_max;
+ d->max_pages = min(new_max, (uint64_t)(typeof(d->max_pages))-1);
ret = 0;
spin_unlock(&d->page_alloc_lock);
}
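
The bug being fixed is silent narrowing: assigning a 64-bit request to the narrower d->max_pages field truncates, while the intent is to clamp to the largest representable value. A small standalone illustration of truncate vs. saturate (plain C with a hypothetical 32-bit field, not the Xen structure):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    struct domain_like {
        uint32_t max_pages;                 /* narrower than the domctl input */
    };

    int main(void)
    {
        struct domain_like d;
        uint64_t new_max = 0x123456789ULL;  /* does not fit in 32 bits */

        d.max_pages = new_max;              /* truncates to 0x23456789 */
        printf("truncated: 0x%" PRIx32 "\n", d.max_pages);

        /* Saturate instead, as min(new_max, (typeof(d->max_pages))-1) does. */
        d.max_pages = new_max > UINT32_MAX ? UINT32_MAX : new_max;
        printf("saturated: 0x%" PRIx32 "\n", d.max_pages);
        return 0;
    }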

@@ -1,250 +0,0 @@
# Commit d72a4605e18d3a61c4469ff092dbbbfa4ac919f7
# Date 2015-05-05 18:01:33 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: allow 64-bit PV guest kernels to suppress user mode exposure of M2P
Xen L4 entries being uniformly installed into any L4 table and 64-bit
PV kernels running in ring 3 means that user mode was able to see the
read-only M2P presented by Xen to the guests. While apparently not
really representing an exploitable information leak, this still very
certainly was never meant to be that way.
Building on the fact that these guests already have separate kernel and
user mode page tables we can allow guest kernels to tell Xen that they
don't want user mode to see this table. We can't, however, do this by
default: There is no ABI requirement that kernel and user mode page
tables be separate. Therefore introduce a new VM-assist flag allowing
the guest to control respective hypervisor behavior:
- when not set, L4 tables get created with the respective slot blank,
and whenever the L4 table gets used as a kernel one the missing
mapping gets inserted,
- when set, L4 tables get created with the respective slot initialized
as before, and whenever the L4 table gets used as a user one the
mapping gets zapped.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -338,7 +338,7 @@ static int setup_compat_l4(struct vcpu *
l4tab = __map_domain_page(pg);
clear_page(l4tab);
- init_guest_l4_table(l4tab, v->domain);
+ init_guest_l4_table(l4tab, v->domain, 1);
unmap_domain_page(l4tab);
v->arch.guest_table = pagetable_from_page(pg);
@@ -977,7 +977,11 @@ int arch_set_info_guest(
case -EINTR:
rc = -ERESTART;
case -ERESTART:
+ break;
case 0:
+ if ( !compat && !VM_ASSIST(d, VMASST_TYPE_m2p_strict) &&
+ !paging_mode_refcounts(d) )
+ fill_ro_mpt(cr3_gfn);
break;
default:
if ( cr3_page == current->arch.old_guest_table )
@@ -1012,7 +1016,10 @@ int arch_set_info_guest(
default:
if ( cr3_page == current->arch.old_guest_table )
cr3_page = NULL;
+ break;
case 0:
+ if ( VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
+ zap_ro_mpt(cr3_gfn);
break;
}
}
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1092,7 +1092,7 @@ int __init construct_dom0(
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
}
clear_page(l4tab);
- init_guest_l4_table(l4tab, d);
+ init_guest_l4_table(l4tab, d, 0);
v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
if ( is_pv_32on64_domain(d) )
v->arch.guest_table_user = v->arch.guest_table;
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1380,7 +1380,8 @@ static int alloc_l3_table(struct page_in
return rc > 0 ? 0 : rc;
}
-void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d)
+void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d,
+ bool_t zap_ro_mpt)
{
/* Xen private mappings. */
memcpy(&l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT],
@@ -1395,6 +1396,25 @@ void init_guest_l4_table(l4_pgentry_t l4
l4e_from_pfn(domain_page_map_to_mfn(l4tab), __PAGE_HYPERVISOR);
l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR);
+ if ( zap_ro_mpt || is_pv_32on64_domain(d) || paging_mode_refcounts(d) )
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
+}
+
+void fill_ro_mpt(unsigned long mfn)
+{
+ l4_pgentry_t *l4tab = map_domain_page(mfn);
+
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
+ idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+ unmap_domain_page(l4tab);
+}
+
+void zap_ro_mpt(unsigned long mfn)
+{
+ l4_pgentry_t *l4tab = map_domain_page(mfn);
+
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
+ unmap_domain_page(l4tab);
}
static int alloc_l4_table(struct page_info *page)
@@ -1444,7 +1464,7 @@ static int alloc_l4_table(struct page_in
adjust_guest_l4e(pl4e[i], d);
}
- init_guest_l4_table(pl4e, d);
+ init_guest_l4_table(pl4e, d, !VM_ASSIST(d, VMASST_TYPE_m2p_strict));
unmap_domain_page(pl4e);
return rc > 0 ? 0 : rc;
@@ -2755,6 +2775,8 @@ int new_guest_cr3(unsigned long mfn)
invalidate_shadow_ldt(curr, 0);
+ if ( !VM_ASSIST(d, VMASST_TYPE_m2p_strict) && !paging_mode_refcounts(d) )
+ fill_ro_mpt(mfn);
curr->arch.guest_table = pagetable_from_pfn(mfn);
update_cr3(curr);
@@ -3111,6 +3133,9 @@ long do_mmuext_op(
op.arg1.mfn);
break;
}
+ if ( VM_ASSIST(d, VMASST_TYPE_m2p_strict) &&
+ !paging_mode_refcounts(d) )
+ zap_ro_mpt(op.arg1.mfn);
}
curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1438,6 +1438,13 @@ void sh_install_xen_entries_in_l4(struct
shadow_l4e_from_mfn(page_to_mfn(d->arch.perdomain_l3_pg),
__PAGE_HYPERVISOR);
+ if ( !shadow_mode_external(d) && !is_pv_32on64_domain(d) &&
+ !VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
+ {
+ /* open coded zap_ro_mpt(mfn_x(sl4mfn)): */
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty();
+ }
+
/* Shadow linear mapping for 4-level shadows. N.B. for 3-level
* shadows on 64-bit xen, this linear mapping is later replaced by the
* monitor pagetable structure, which is built in make_monitor_table
@@ -4062,6 +4069,17 @@ sh_update_cr3(struct vcpu *v, int do_loc
if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
flush_tlb_mask(d->domain_dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
+ if ( !shadow_mode_external(d) && !is_pv_32on64_domain(d) )
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table[0]);
+
+ if ( !(v->arch.flags & TF_kernel_mode) &&
+ VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
+ zap_ro_mpt(mfn_x(smfn));
+ else if ( (v->arch.flags & TF_kernel_mode) &&
+ !VM_ASSIST(d, VMASST_TYPE_m2p_strict) )
+ fill_ro_mpt(mfn_x(smfn));
+ }
#else
#error This should never happen
#endif
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -480,7 +480,7 @@ static int setup_m2p_table(struct mem_ho
l2_ro_mpt += l2_table_offset(va);
}
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
l2e_write(l2_ro_mpt, l2e_from_pfn(mfn,
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
}
@@ -583,7 +583,7 @@ void __init paging_init(void)
0x77, 1UL << L3_PAGETABLE_SHIFT);
ASSERT(!l2_table_offset(va));
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
l3e_write(&l3_ro_mpt[l3_table_offset(va)],
l3e_from_page(l1_pg,
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
@@ -621,7 +621,7 @@ void __init paging_init(void)
l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
ASSERT(!l2_table_offset(va));
}
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
if ( l1_pg )
l2e_write(l2_ro_mpt, l2e_from_page(
l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -330,7 +330,8 @@ extern unsigned long xen_phys_start;
#define NATIVE_VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments) | \
(1UL << VMASST_TYPE_4gb_segments_notify) | \
(1UL << VMASST_TYPE_writable_pagetables) | \
- (1UL << VMASST_TYPE_pae_extended_cr3))
+ (1UL << VMASST_TYPE_pae_extended_cr3) | \
+ (1UL << VMASST_TYPE_m2p_strict))
#define VM_ASSIST_VALID NATIVE_VM_ASSIST_VALID
#define COMPAT_VM_ASSIST_VALID (NATIVE_VM_ASSIST_VALID & \
((1UL << COMPAT_BITS_PER_LONG) - 1))
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -314,7 +314,10 @@ static inline void *__page_to_virt(const
int free_page_type(struct page_info *page, unsigned long type,
int preemptible);
-void init_guest_l4_table(l4_pgentry_t[], const struct domain *);
+void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
+ bool_t zap_ro_mpt);
+void fill_ro_mpt(unsigned long mfn);
+void zap_ro_mpt(unsigned long mfn);
int is_iomem_page(unsigned long mfn);
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -486,6 +486,18 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3
+/*
+ * x86/64 guests: strictly hide M2P from user mode.
+ * This allows the guest to control respective hypervisor behavior:
+ * - when not set, L4 tables get created with the respective slot blank,
+ * and whenever the L4 table gets used as a kernel one the missing
+ * mapping gets inserted,
+ * - when set, L4 tables get created with the respective slot initialized
+ * as before, and whenever the L4 table gets used as a user one the
+ * mapping gets zapped.
+ */
+#define VMASST_TYPE_m2p_strict 32
+
#if __XEN_INTERFACE_VERSION__ < 0x00040600
#define MAX_VMASST_TYPE 3
#endif
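As an aside, a minimal guest-side sketch (not part of this patch) of how a
64-bit PV kernel would opt in to the strict behaviour. VMASST_CMD_enable and
VMASST_TYPE_m2p_strict come from this public header; HYPERVISOR_vm_assist()
is the conventional guest hypercall wrapper and is assumed here:

    /* Hypothetical guest code: ask Xen to hide the M2P from user mode.
     * Returns 0 on success, or an error if the hypervisor does not
     * accept VMASST_TYPE_m2p_strict. */
    static int enable_m2p_strict(void)
    {
        return HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_m2p_strict);
    }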

View File

@ -1,68 +0,0 @@
# Commit 2bfc9fc52ce8485fa43e79bbdc32360c74e12fe8
# Date 2015-05-08 10:59:26 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: provide arch_fetch_and_add()
arch_fetch_and_add() atomically adds a value and returns the previous
value.
This is needed to implement ticket locks.
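A brief usage sketch (illustrative only, not from this patch); next_ticket is
a hypothetical counter:

    /* Atomically bump the counter and keep the pre-increment value,
     * e.g. to hand out the next ticket of a ticket lock. */
    static unsigned int take_ticket(unsigned int *next_ticket)
    {
        return arch_fetch_and_add(next_ticket, 1);
    }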
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -118,6 +118,52 @@ static always_inline unsigned long __cmp
})
/*
+ * Undefined symbol to cause link failure if a wrong size is used with
+ * arch_fetch_and_add().
+ */
+extern unsigned long __bad_fetch_and_add_size(void);
+
+static always_inline unsigned long __xadd(
+ volatile void *ptr, unsigned long v, int size)
+{
+ switch ( size )
+ {
+ case 1:
+ asm volatile ( "lock; xaddb %b0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 2:
+ asm volatile ( "lock; xaddw %w0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 4:
+ asm volatile ( "lock; xaddl %k0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 8:
+ asm volatile ( "lock; xaddq %q0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+
+ return v;
+ default:
+ return __bad_fetch_and_add_size();
+ }
+}
+
+/*
+ * Atomically add @v to the 1, 2, 4, or 8 byte value at @ptr. Returns
+ * the previous value.
+ *
+ * This is a full memory barrier.
+ */
+#define arch_fetch_and_add(ptr, v) \
+ ((typeof(*(ptr)))__xadd(ptr, (typeof(*(ptr)))(v), sizeof(*(ptr))))
+
+/*
* Both Intel and AMD agree that, from a programmer's viewpoint:
* Loads cannot be reordered relative to other loads.
* Stores cannot be reordered relative to other stores.

View File

@ -1,29 +0,0 @@
# Commit f9cc3cd9b4de58cf032c8624406384c172937e57
# Date 2015-05-08 10:59:44 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: provide arch_fetch_and_add()
arch_fetch_and_add() atomically adds a value and returns the previous
value.
This generic arm implementation uses the GCC __sync_fetch_and_add()
builtin. This builtin resulted in suitable inlined asm for GCC 4.8.3
(arm64) and GCC 4.6.3 (arm32).
This is needed to implement ticket locks.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/include/asm-arm/system.h
+++ b/xen/include/asm-arm/system.h
@@ -51,6 +51,8 @@
# error "unknown ARM variant"
#endif
+#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)
+
extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);
#endif

View File

@ -1,155 +0,0 @@
commit 161212ef02312c0681d2d809c8ff1e1f0ea6f6f9
Author: Fabio Fantoni <fabio.fantoni@m2r.biz>
Date: Wed Apr 29 11:20:28 2015 +0200
libxl: Add qxl vga interface support for upstream qemu
Usage:
vga="qxl"
QXL vga supports many resolutions that stdvga does not, mainly the
16:9 ones and other high resolutions up to 2560x1600.
With QXL you can get improved performance and smooth video even at
high resolutions and high quality.
It requires the QXL drivers installed in the domU and Spice to be
used; otherwise it acts as a simple stdvga.
Signed-off-by: Fabio Fantoni <fabio.fantoni@m2r.biz>
Signed-off-by: Zhou Peng <zpengxen@gmail.com>
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Index: xen-4.5.1-testing/docs/man/xl.cfg.pod.5
===================================================================
--- xen-4.5.1-testing.orig/docs/man/xl.cfg.pod.5
+++ xen-4.5.1-testing/docs/man/xl.cfg.pod.5
@@ -1292,6 +1292,9 @@ qemu-xen-traditional device-model, the a
which is sufficient for 1024x768 at 32 bpp. For the upstream qemu-xen
device-model, the default and minimum is 8 MB.
+For B<qxl> vga, both the default and the minimum are 128MB.
+If B<videoram> is set to less than 128MB, an error will be triggered.
+
=item B<stdvga=BOOLEAN>
Select a standard VGA card with VBE (VESA BIOS Extensions) as the
@@ -1303,9 +1306,14 @@ This option is deprecated, use vga="stdv
=item B<vga="STRING">
-Selects the emulated video card (none|stdvga|cirrus).
+Selects the emulated video card (none|stdvga|cirrus|qxl).
The default is cirrus.
+In general, QXL should work with the Spice remote display protocol
+for acceleration, and the QXL driver is necessary in the guest in this case.
+QXL can also work with the VNC protocol, but it will be like a standard
+VGA without acceleration.
+
=item B<vnc=BOOLEAN>
Allow access to the display via the VNC protocol. This enables the
Index: xen-4.5.1-testing/tools/libxl/libxl.h
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl.h
+++ xen-4.5.1-testing/tools/libxl/libxl.h
@@ -506,6 +506,16 @@ typedef struct libxl__ctx libxl_ctx;
#define LIBXL_HAVE_DOMINFO_OUTSTANDING_MEMKB 1
/*
+ * LIBXL_HAVE_QXL
+ *
+ * If defined, then the libxl_vga_interface_type will contain another value:
+ * "QXL". This value indicates that qxl vga is supported.
+ *
+ * If this is not defined, qxl vga support is absent.
+ */
+#define LIBXL_HAVE_QXL 1
+
+/*
* LIBXL_HAVE_SPICE_VDAGENT
*
* If defined, then the libxl_spice_info structure will contain a boolean type:
Index: xen-4.5.1-testing/tools/libxl/libxl_create.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_create.c
+++ xen-4.5.1-testing/tools/libxl/libxl_create.c
@@ -240,6 +240,10 @@ int libxl__domain_build_info_setdefault(
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 0;
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ LOG(ERROR,"qemu upstream required for qxl vga");
+ return ERROR_INVAL;
+ break;
case LIBXL_VGA_INTERFACE_TYPE_STD:
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 8 * 1024;
@@ -264,6 +268,15 @@ int libxl__domain_build_info_setdefault(
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 0;
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT) {
+ b_info->video_memkb = (128 * 1024);
+ } else if (b_info->video_memkb < (128 * 1024)) {
+ LOG(ERROR,
+ "128 Mib videoram is the minimum for qxl default");
+ return ERROR_INVAL;
+ }
+ break;
case LIBXL_VGA_INTERFACE_TYPE_STD:
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 16 * 1024;
Index: xen-4.5.1-testing/tools/libxl/libxl_dm.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dm.c
+++ xen-4.5.1-testing/tools/libxl/libxl_dm.c
@@ -251,6 +251,8 @@ static char ** libxl__build_device_model
case LIBXL_VGA_INTERFACE_TYPE_NONE:
flexarray_append_pair(dm_args, "-vga", "none");
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ break;
}
if (b_info->u.hvm.boot) {
@@ -616,6 +618,12 @@ static char ** libxl__build_device_model
break;
case LIBXL_VGA_INTERFACE_TYPE_NONE:
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ /* QXL has 2 RAM regions, ram and vram */
+ flexarray_append_pair(dm_args, "-device",
+ GCSPRINTF("qxl-vga,vram_size_mb=%"PRIu64",ram_size_mb=%"PRIu64,
+ (b_info->video_memkb/2/1024), (b_info->video_memkb/2/1024) ) );
+ break;
}
if (b_info->u.hvm.boot) {
Index: xen-4.5.1-testing/tools/libxl/libxl_types.idl
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_types.idl
+++ xen-4.5.1-testing/tools/libxl/libxl_types.idl
@@ -181,6 +181,7 @@ libxl_vga_interface_type = Enumeration("
(1, "CIRRUS"),
(2, "STD"),
(3, "NONE"),
+ (4, "QXL"),
], init_val = "LIBXL_VGA_INTERFACE_TYPE_CIRRUS")
libxl_vendor_device = Enumeration("vendor_device", [
Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/xl_cmdimpl.c
+++ xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
@@ -1910,6 +1910,8 @@ skip_vfb:
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_CIRRUS;
} else if (!strcmp(buf, "none")) {
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_NONE;
+ } else if (!strcmp(buf, "qxl")) {
+ b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_QXL;
} else {
fprintf(stderr, "Unknown vga \"%s\" specified\n", buf);
exit(1);

View File

@ -1,65 +0,0 @@
# Commit 3c694aec08dda782d9c866e599b848dff86f474f
# Date 2015-05-13 15:00:58 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: provide add_sized()
add_sized(ptr, inc) adds inc to the value at ptr using only the correct
size of loads and stores for the type of *ptr. The add is /not/ atomic.
This is needed for ticket locks to ensure the increment of the head ticket
does not affect the tail ticket.
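A short sketch of the intended use (illustrative only; the two-halves layout
mirrors the ticket lock introduced later in this series):

    /* Not atomic, but guaranteed to issue a single 16-bit store, so the
     * adjacent 16-bit field sharing the same word is never rewritten. */
    struct two_halves { uint16_t head, tail; };

    static void bump_head(struct two_halves *t)
    {
        add_sized(&t->head, 1);   /* touches t->head only, never t->tail */
    }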
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/include/asm-x86/atomic.h
+++ b/xen/include/asm-x86/atomic.h
@@ -14,6 +14,14 @@ static inline void name(volatile type *a
{ asm volatile("mov" size " %1,%0": "=m" (*(volatile type *)addr) \
:reg (val) barrier); }
+#define build_add_sized(name, size, type, reg) \
+ static inline void name(volatile type *addr, type val) \
+ { \
+ asm volatile("add" size " %1,%0" \
+ : "=m" (*addr) \
+ : reg (val)); \
+ }
+
build_read_atomic(read_u8_atomic, "b", uint8_t, "=q", )
build_read_atomic(read_u16_atomic, "w", uint16_t, "=r", )
build_read_atomic(read_u32_atomic, "l", uint32_t, "=r", )
@@ -25,8 +33,14 @@ build_write_atomic(write_u32_atomic, "l"
build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", )
build_write_atomic(write_u64_atomic, "q", uint64_t, "r", )
+build_add_sized(add_u8_sized, "b", uint8_t, "qi")
+build_add_sized(add_u16_sized, "w", uint16_t, "ri")
+build_add_sized(add_u32_sized, "l", uint32_t, "ri")
+build_add_sized(add_u64_sized, "q", uint64_t, "ri")
+
#undef build_read_atomic
#undef build_write_atomic
+#undef build_add_sized
void __bad_atomic_size(void);
@@ -54,6 +68,18 @@ void __bad_atomic_size(void);
__x; \
})
+#define add_sized(p, x) ({ \
+ typeof(*(p)) x_ = (x); \
+ switch ( sizeof(*(p)) ) \
+ { \
+ case 1: add_u8_sized((uint8_t *)(p), x_); break; \
+ case 2: add_u16_sized((uint16_t *)(p), x_); break; \
+ case 4: add_u32_sized((uint32_t *)(p), x_); break; \
+ case 8: add_u64_sized((uint64_t *)(p), x_); break; \
+ default: __bad_atomic_size(); break; \
+ } \
+})
+
/*
* NB. I've pushed the volatile qualifier into the operations. This allows
* fast accessors such as _atomic_read() and _atomic_set() which don't give

View File

@ -1,64 +0,0 @@
# Commit 890674d13feb4a270aa112ca452dcf62fdd53f34
# Date 2015-05-13 15:01:25 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: provide add_sized()
add_sized(ptr, inc) adds inc to the value at ptr using only the correct
size of loads and stores for the type of *ptr. The add is /not/ atomic.
This is needed for ticket locks to ensure the increment of the head ticket
does not affect the tail ticket.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/include/asm-arm/atomic.h
+++ b/xen/include/asm-arm/atomic.h
@@ -23,6 +23,17 @@ static inline void name(volatile type *a
: reg (val)); \
}
+#define build_add_sized(name, size, width, type, reg) \
+static inline void name(volatile type *addr, type val) \
+{ \
+ type t; \
+ asm volatile("ldr" size " %"width"1,%0\n" \
+ "add %"width"1,%"width"1,%"width"2\n" \
+ "str" size " %"width"1,%0" \
+ : "=m" (*(volatile type *)addr), "=r" (t) \
+ : reg (val)); \
+}
+
#if defined (CONFIG_ARM_32)
#define BYTE ""
#define WORD ""
@@ -46,6 +57,10 @@ build_atomic_read(read_u64_atomic, "x",
build_atomic_write(write_u64_atomic, "x", uint64_t, "r")
#endif
+build_add_sized(add_u8_sized, "b", BYTE, uint8_t, "ri")
+build_add_sized(add_u16_sized, "h", WORD, uint16_t, "ri")
+build_add_sized(add_u32_sized, "", WORD, uint32_t, "ri")
+
void __bad_atomic_size(void);
#define read_atomic(p) ({ \
@@ -70,6 +85,17 @@ void __bad_atomic_size(void);
__x; \
})
+#define add_sized(p, x) ({ \
+ typeof(*(p)) __x = (x); \
+ switch ( sizeof(*(p)) ) \
+ { \
+ case 1: add_u8_sized((uint8_t *)(p), __x); break; \
+ case 2: add_u16_sized((uint16_t *)(p), __x); break; \
+ case 4: add_u32_sized((uint32_t *)(p), __x); break; \
+ default: __bad_atomic_size(); break; \
+ } \
+})
+
/*
* NB. I've pushed the volatile qualifier into the operations. This allows
* fast accessors such as _atomic_read() and _atomic_set() which don't give

View File

@ -1,305 +0,0 @@
# Commit 45fcc4568c5162b00fb3907fb158af82dd484a3d
# Date 2015-05-15 09:49:12 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
use ticket locks for spin locks
Replace the byte locks with ticket locks. Ticket locks are: a) fair;
and b) perform better when contended since they spin without an atomic
operation.
The lock is split into two ticket values: head and tail. A locker
acquires a ticket by (atomically) increasing tail and using the
previous tail value. A CPU holds the lock if its ticket == head. The
lock is released by increasing head.
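For illustration only, a compact user-space model of the same head/tail
scheme (GCC builtins stand in for arch_fetch_and_add() and add_sized(); a
little-endian head/tail layout is assumed, matching the Xen structure):

    #include <stdint.h>

    typedef union {
        uint32_t head_tail;
        struct { uint16_t head, tail; };  /* head = low half, tail = high half */
    } tickets_t;

    static void ticket_lock(volatile tickets_t *t)
    {
        /* Take a ticket: the old tail value is mine, tail advances by one. */
        uint32_t old = __atomic_fetch_add(&t->head_tail, 0x10000, __ATOMIC_ACQUIRE);
        uint16_t me = old >> 16;

        while (__atomic_load_n(&t->head, __ATOMIC_ACQUIRE) != me)
            ;  /* spin until my ticket is being served */
    }

    static void ticket_unlock(volatile tickets_t *t)
    {
        /* Only the holder writes head, so a plain increment then store is fine. */
        __atomic_store_n(&t->head, t->head + 1, __ATOMIC_RELEASE);
    }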
spin_lock_irq() and spin_lock_irqsave() now spin with irqs disabled
(previously, they would spin with irqs enabled if possible). This is
required to prevent deadlocks when the irq handler tries to take the
same lock with a higher ticket.
Architectures need only provide arch_fetch_and_add() and two barriers:
arch_lock_acquire_barrier() and arch_lock_release_barrier().
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -115,125 +115,134 @@ void spin_debug_disable(void)
#endif
+static always_inline spinlock_tickets_t observe_lock(spinlock_tickets_t *t)
+{
+ spinlock_tickets_t v;
+
+ smp_rmb();
+ v.head_tail = read_atomic(&t->head_tail);
+ return v;
+}
+
+static always_inline u16 observe_head(spinlock_tickets_t *t)
+{
+ smp_rmb();
+ return read_atomic(&t->head);
+}
+
void _spin_lock(spinlock_t *lock)
{
+ spinlock_tickets_t tickets = SPINLOCK_TICKET_INC;
LOCK_PROFILE_VAR;
check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
+ tickets.head_tail = arch_fetch_and_add(&lock->tickets.head_tail,
+ tickets.head_tail);
+ while ( tickets.tail != observe_head(&lock->tickets) )
{
LOCK_PROFILE_BLOCK;
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
+ cpu_relax();
}
LOCK_PROFILE_GOT;
preempt_disable();
+ arch_lock_acquire_barrier();
}
void _spin_lock_irq(spinlock_t *lock)
{
- LOCK_PROFILE_VAR;
-
ASSERT(local_irq_is_enabled());
local_irq_disable();
- check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
- {
- LOCK_PROFILE_BLOCK;
- local_irq_enable();
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
- local_irq_disable();
- }
- LOCK_PROFILE_GOT;
- preempt_disable();
+ _spin_lock(lock);
}
unsigned long _spin_lock_irqsave(spinlock_t *lock)
{
unsigned long flags;
- LOCK_PROFILE_VAR;
local_irq_save(flags);
- check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
- {
- LOCK_PROFILE_BLOCK;
- local_irq_restore(flags);
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
- local_irq_save(flags);
- }
- LOCK_PROFILE_GOT;
- preempt_disable();
+ _spin_lock(lock);
return flags;
}
void _spin_unlock(spinlock_t *lock)
{
+ arch_lock_release_barrier();
preempt_enable();
LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ add_sized(&lock->tickets.head, 1);
}
void _spin_unlock_irq(spinlock_t *lock)
{
- preempt_enable();
- LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ _spin_unlock(lock);
local_irq_enable();
}
void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
- preempt_enable();
- LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ _spin_unlock(lock);
local_irq_restore(flags);
}
int _spin_is_locked(spinlock_t *lock)
{
check_lock(&lock->debug);
- return _raw_spin_is_locked(&lock->raw);
+ return lock->tickets.head != lock->tickets.tail;
}
int _spin_trylock(spinlock_t *lock)
{
+ spinlock_tickets_t old, new;
+
check_lock(&lock->debug);
- if ( !_raw_spin_trylock(&lock->raw) )
+ old = observe_lock(&lock->tickets);
+ if ( old.head != old.tail )
+ return 0;
+ new = old;
+ new.tail++;
+ if ( cmpxchg(&lock->tickets.head_tail,
+ old.head_tail, new.head_tail) != old.head_tail )
return 0;
#ifdef LOCK_PROFILE
if (lock->profile)
lock->profile->time_locked = NOW();
#endif
preempt_disable();
+ /*
+ * cmpxchg() is a full barrier so no need for an
+ * arch_lock_acquire_barrier().
+ */
return 1;
}
void _spin_barrier(spinlock_t *lock)
{
+ spinlock_tickets_t sample;
#ifdef LOCK_PROFILE
s_time_t block = NOW();
- u64 loop = 0;
+#endif
check_barrier(&lock->debug);
- do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) );
- if ((loop > 1) && lock->profile)
+ smp_mb();
+ sample = observe_lock(&lock->tickets);
+ if ( sample.head != sample.tail )
{
- lock->profile->time_block += NOW() - block;
- lock->profile->block_cnt++;
- }
-#else
- check_barrier(&lock->debug);
- do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) );
+ while ( observe_head(&lock->tickets) == sample.head )
+ cpu_relax();
+#ifdef LOCK_PROFILE
+ if ( lock->profile )
+ {
+ lock->profile->time_block += NOW() - block;
+ lock->profile->block_cnt++;
+ }
#endif
+ }
smp_mb();
}
int _spin_trylock_recursive(spinlock_t *lock)
{
- int cpu = smp_processor_id();
+ unsigned int cpu = smp_processor_id();
/* Don't allow overflow of recurse_cpu field. */
BUILD_BUG_ON(NR_CPUS > 0xfffu);
@@ -256,8 +265,17 @@ int _spin_trylock_recursive(spinlock_t *
void _spin_lock_recursive(spinlock_t *lock)
{
- while ( !spin_trylock_recursive(lock) )
- cpu_relax();
+ unsigned int cpu = smp_processor_id();
+
+ if ( likely(lock->recurse_cpu != cpu) )
+ {
+ _spin_lock(lock);
+ lock->recurse_cpu = cpu;
+ }
+
+ /* We support only fairly shallow recursion, else the counter overflows. */
+ ASSERT(lock->recurse_cnt < 0xfu);
+ lock->recurse_cnt++;
}
void _spin_unlock_recursive(spinlock_t *lock)
--- a/xen/include/asm-arm/system.h
+++ b/xen/include/asm-arm/system.h
@@ -53,6 +53,9 @@
#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)
+#define arch_lock_acquire_barrier() smp_mb()
+#define arch_lock_release_barrier() smp_mb()
+
extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);
#endif
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -185,6 +185,17 @@ static always_inline unsigned long __xad
#define set_mb(var, value) do { xchg(&var, value); } while (0)
#define set_wmb(var, value) do { var = value; wmb(); } while (0)
+/*
+ * On x86 the only reordering is of reads with older writes. In the
+ * lock case, the read in observe_head() can only be reordered with
+ * writes that precede it, and moving a write _into_ a locked section
+ * is OK. In the release case, the write in add_sized() can only be
+ * reordered with reads that follow it, and hoisting a read _into_ a
+ * locked region is OK.
+ */
+#define arch_lock_acquire_barrier() barrier()
+#define arch_lock_release_barrier() barrier()
+
#define local_irq_disable() asm volatile ( "cli" : : : "memory" )
#define local_irq_enable() asm volatile ( "sti" : : : "memory" )
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -80,8 +80,7 @@ struct lock_profile_qhead {
static struct lock_profile *__lock_profile_##name \
__used_section(".lockprofile.data") = \
&__lock_profile_data_##name
-#define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, \
- _LOCK_DEBUG, x }
+#define _SPIN_LOCK_UNLOCKED(x) { { 0 }, 0xfffu, 0, _LOCK_DEBUG, x }
#define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL)
#define DEFINE_SPINLOCK(l) \
spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL); \
@@ -117,8 +116,7 @@ extern void spinlock_profile_reset(unsig
struct lock_profile_qhead { };
-#define SPIN_LOCK_UNLOCKED \
- { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
+#define SPIN_LOCK_UNLOCKED { { 0 }, 0xfffu, 0, _LOCK_DEBUG }
#define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
#define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l))
@@ -127,8 +125,18 @@ struct lock_profile_qhead { };
#endif
+typedef union {
+ u32 head_tail;
+ struct {
+ u16 head;
+ u16 tail;
+ };
+} spinlock_tickets_t;
+
+#define SPINLOCK_TICKET_INC { .head_tail = 0x10000, }
+
typedef struct spinlock {
- raw_spinlock_t raw;
+ spinlock_tickets_t tickets;
u16 recurse_cpu:12;
u16 recurse_cnt:4;
struct lock_debug debug;

View File

@ -1,266 +0,0 @@
# Commit e62e49e6d5d4e8d22f3df0b75443ede65a812435
# Date 2015-05-15 09:52:25 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86,arm: remove asm/spinlock.h from all architectures
Now that all architecture use a common ticket lock implementation for
spinlocks, remove the architecture specific byte lock implementations.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Acked-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/arch/arm/README.LinuxPrimitives
+++ b/xen/arch/arm/README.LinuxPrimitives
@@ -25,16 +25,6 @@ linux/arch/arm64/include/asm/atomic.h
---------------------------------------------------------------------
-spinlocks: last sync @ v3.16-rc6 (last commit: 95c4189689f9)
-
-linux/arch/arm64/include/asm/spinlock.h xen/include/asm-arm/arm64/spinlock.h
-
-Skipped:
- 5686b06 arm64: lockref: add support for lockless lockrefs using cmpxchg
- 52ea2a5 arm64: locks: introduce ticket-based spinlock implementation
-
----------------------------------------------------------------------
-
mem*: last sync @ v3.16-rc6 (last commit: d875c9b37240)
linux/arch/arm64/lib/memchr.S xen/arch/arm/arm64/lib/memchr.S
@@ -103,24 +93,6 @@ linux/arch/arm/include/asm/atomic.h
---------------------------------------------------------------------
-spinlocks: last sync: 15e7e5c1ebf5
-
-linux/arch/arm/include/asm/spinlock.h xen/include/asm-arm/arm32/spinlock.h
-
-*** Linux has switched to ticket locks but we still use bitlocks.
-
-resync to v3.14-rc7:
-
- 7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev
- 0cbad9c ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64
- 9bb17be ARM: locks: prefetch the destination word for write prior to strex
- 27a8479 ARM: smp_on_up: move inline asm ALT_SMP patching macro out of spinlock.
- 00efaa0 ARM: 7812/1: rwlocks: retry trylock operation if strex fails on free lo
- afa31d8 ARM: 7811/1: locks: use early clobber in arch_spin_trylock
- 73a6fdc ARM: spinlock: use inner-shareable dsb variant prior to sev instruction
-
----------------------------------------------------------------------
-
mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
linux/arch/arm/lib/copy_template.S xen/arch/arm/arm32/lib/copy_template.S
--- a/xen/include/asm-arm/arm32/spinlock.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef __ASM_ARM32_SPINLOCK_H
-#define __ASM_ARM32_SPINLOCK_H
-
-static inline void dsb_sev(void)
-{
- __asm__ __volatile__ (
- "dsb\n"
- "sev\n"
- );
-}
-
-typedef struct {
- volatile unsigned int lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
-
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
-
- smp_mb();
-
- __asm__ __volatile__(
-" str %1, [%0]\n"
- :
- : "r" (&lock->lock), "r" (0)
- : "cc");
-
- dsb_sev();
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- unsigned long contended, res;
-
- do {
- __asm__ __volatile__(
- " ldrex %0, [%2]\n"
- " teq %0, #0\n"
- " strexeq %1, %3, [%2]\n"
- " movne %1, #0\n"
- : "=&r" (contended), "=r" (res)
- : "r" (&lock->lock), "r" (1)
- : "cc");
- } while (res);
-
- if (!contended) {
- smp_mb();
- return 1;
- } else {
- return 0;
- }
-}
-
-#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
--- a/xen/include/asm-arm/arm64/spinlock.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Derived from Linux arch64 spinlock.h which is:
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ASM_ARM64_SPINLOCK_H
-#define __ASM_ARM64_SPINLOCK_H
-
-typedef struct {
- volatile unsigned int lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
-
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
-
- asm volatile(
- " stlr %w1, %0\n"
- : "=Q" (lock->lock) : "r" (0) : "memory");
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- unsigned int tmp;
-
- asm volatile(
- "2: ldaxr %w0, %1\n"
- " cbnz %w0, 1f\n"
- " stxr %w0, %w2, %1\n"
- " cbnz %w0, 2b\n"
- "1:\n"
- : "=&r" (tmp), "+Q" (lock->lock)
- : "r" (1)
- : "cc", "memory");
-
- return !tmp;
-}
-
-#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
--- a/xen/include/asm-arm/spinlock.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <xen/config.h>
-#include <xen/lib.h>
-
-#if defined(CONFIG_ARM_32)
-# include <asm/arm32/spinlock.h>
-#elif defined(CONFIG_ARM_64)
-# include <asm/arm64/spinlock.h>
-#else
-# error "unknown ARM variant"
-#endif
-
-#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
--- a/xen/include/asm-x86/spinlock.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <xen/config.h>
-#include <xen/lib.h>
-#include <asm/atomic.h>
-
-typedef struct {
- volatile s16 lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 }
-
-#define _raw_spin_is_locked(x) ((x)->lock <= 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
- asm volatile (
- "movw $1,%0"
- : "=m" (lock->lock) : : "memory" );
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- s16 oldval;
- asm volatile (
- "xchgw %w0,%1"
- :"=r" (oldval), "=m" (lock->lock)
- :"0" ((s16)0) : "memory" );
- return (oldval > 0);
-}
-
-#endif /* __ASM_SPINLOCK_H */
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -2,7 +2,6 @@
#define __SPINLOCK_H__
#include <asm/system.h>
-#include <asm/spinlock.h>
#ifndef NDEBUG
struct lock_debug {

View File

@ -1,141 +0,0 @@
# Commit f278fcf19ce15f7b7ee69181560b5884a5e12b66
# Date 2015-05-15 10:06:04 +0200
# Author Roger Pau Monné <roger.pau@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
introduce a helper to allocate non-contiguous memory
The allocator uses independent calls to alloc_domheap_pages in order to get
the desired amount of memory and then maps all the independent physical
addresses into a contiguous virtual address space.
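A hedged usage sketch (hypothetical caller; struct foo and nr are
illustrative, while vzalloc()/vfree() are the interfaces this patch declares
in xen/vmap.h):

    #include <xen/vmap.h>

    struct foo { unsigned long data; };   /* hypothetical payload type */

    /* Allocate a zeroed, virtually contiguous table that need not be
     * physically contiguous; the caller releases it with vfree(), and
     * vfree(NULL) is a no-op per the follow-up fix folded in above. */
    static struct foo *alloc_foo_table(unsigned int nr)
    {
        return vzalloc(nr * sizeof(struct foo));
    }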
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Tested-by: Julien Grall <julien.grall@citrix.com> (ARM)
Reviewed-by: Tim Deegan <tim@xen.org>
# Commit 640f891eb258563bb155e577389e8c5e6541a59a
# Date 2015-05-21 08:57:19 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
vmap: avoid hitting an ASSERT with vfree(NULL)
and unconditionally defer the vm_size() call, as it doesn't have a NULL
short circuit.
Reported-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Wei Liu <wei.liu2@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -215,4 +215,75 @@ void vunmap(const void *va)
#endif
vm_free(va);
}
+
+void *vmalloc(size_t size)
+{
+ unsigned long *mfn;
+ size_t pages, i;
+ struct page_info *pg;
+ void *va;
+
+ ASSERT(size);
+
+ pages = PFN_UP(size);
+ mfn = xmalloc_array(unsigned long, pages);
+ if ( mfn == NULL )
+ return NULL;
+
+ for ( i = 0; i < pages; i++ )
+ {
+ pg = alloc_domheap_page(NULL, 0);
+ if ( pg == NULL )
+ goto error;
+ mfn[i] = page_to_mfn(pg);
+ }
+
+ va = vmap(mfn, pages);
+ if ( va == NULL )
+ goto error;
+
+ xfree(mfn);
+ return va;
+
+ error:
+ while ( i-- )
+ free_domheap_page(mfn_to_page(mfn[i]));
+ xfree(mfn);
+ return NULL;
+}
+
+void *vzalloc(size_t size)
+{
+ void *p = vmalloc(size);
+ int i;
+
+ if ( p == NULL )
+ return NULL;
+
+ for ( i = 0; i < size; i += PAGE_SIZE )
+ clear_page(p + i);
+
+ return p;
+}
+
+void vfree(void *va)
+{
+ unsigned int i, pages;
+ struct page_info *pg;
+ PAGE_LIST_HEAD(pg_list);
+
+ if ( !va )
+ return;
+
+ pages = vm_size(va);
+ ASSERT(pages);
+
+ for ( i = 0; i < pages; i++ )
+ page_list_add(vmap_to_page(va + i * PAGE_SIZE), &pg_list);
+
+ vunmap(va);
+
+ while ( (pg = page_list_remove_head(&pg_list)) != NULL )
+ free_domheap_page(pg);
+}
#endif
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -208,6 +208,8 @@ static inline void __iomem *ioremap_wc(p
#define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
#define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT))
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
+#define vmap_to_mfn(va) paddr_to_pfn(virt_to_maddr((vaddr_t)va))
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
/* Page-align address and convert to frame number format */
#define paddr_to_pfn_aligned(paddr) paddr_to_pfn(PAGE_ALIGN(paddr))
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -262,6 +262,8 @@ void copy_page_sse2(void *, const void *
#define pfn_to_paddr(pfn) __pfn_to_paddr(pfn)
#define paddr_to_pfn(pa) __paddr_to_pfn(pa)
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
+#define vmap_to_mfn(va) l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
#endif /* !defined(__ASSEMBLY__) */
--- a/xen/include/xen/vmap.h
+++ b/xen/include/xen/vmap.h
@@ -11,6 +11,9 @@ void *__vmap(const unsigned long *mfn, u
unsigned int nr, unsigned int align, unsigned int flags);
void *vmap(const unsigned long *mfn, unsigned int nr);
void vunmap(const void *);
+void *vmalloc(size_t size);
+void *vzalloc(size_t size);
+void vfree(void *va);
void __iomem *ioremap(paddr_t, size_t);

View File

@ -1,29 +0,0 @@
# Commit fed56ba0e69b251d0222ef0785cd1c1838f9e51d
# Date 2015-06-02 13:45:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
unmodified-drivers: tolerate IRQF_DISABLED being undefined
It's being removed in Linux 4.1.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
@@ -350,11 +350,13 @@ int xen_irq_init(struct pci_dev *pdev)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT,
#else
- IRQF_SHARED |
#ifdef IRQF_SAMPLE_RANDOM
IRQF_SAMPLE_RANDOM |
#endif
- IRQF_DISABLED,
+#ifdef IRQF_DISABLED
+ IRQF_DISABLED |
+#endif
+ IRQF_SHARED,
#endif
"xen-platform-pci", pdev);
}

View File

@ -1,158 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 85baced14dec2fafa9fe560969dba2ae28e8bebb
# Date 2015-06-09 15:59:31 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: adjust PV I/O emulation functions' types
admin_io_okay(), guest_io_read(), and guest_io_write() all don't need
their current "regs" parameter at all, and they don't use the vCPU
passed to them for other than obtaining its domain. Drop the former and
replace the latter by a struct domain pointer.
pci_cfg_okay() returns a boolean type, and its "write" parameter is of
boolean kind too.
All of them get called for the current vCPU (and hence current domain)
only, so name the domain parameters accordingly except in the
admin_io_okay() case, which a subsequent patch will use for simplifying
setup_io_bitmap().
Latch current->domain into a local variable in emulate_privileged_op().
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
# Commit 2d67a7a4d37a4759bcd7f2ee2d740497ad669c7d
# Date 2015-06-18 15:07:10 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: synchronize PCI config space access decoding
Both PV and HVM logic have similar but not similar enough code here.
Synchronize the two so that
- in the HVM case we don't unconditionally try to access extended
config space
- in the PV case we pass a correct range to the XSM hook
- in the PV case we don't needlessly deny access when the operation
isn't really on PCI config space
All this along with sharing the macros HVM already had here.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Backport stripped down to just the pci_cfg_ok() adjustments.
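For reference, a small sketch (not part of the patch) of how the shared
CF8_* macros added to asm-x86/pci.h below pick apart a latched port 0xcf8
value:

    /* Illustrative decode of a saved CF8 value. */
    static void decode_cf8(uint32_t cf8, unsigned int *bdf, unsigned int *off)
    {
        if ( !CF8_ENABLED(cf8) )   /* bit 31: config space access enabled? */
            return;
        *bdf = CF8_BDF(cf8);       /* bus/device/function, bits 8-23 */
        *off = CF8_ADDR_LO(cf8);   /* dword-aligned register offset, bits 2-7 */
        /* CF8_ADDR_HI(cf8) recovers the AMD extended-config bits (24-27),
         * honoured only when the northbridge enables extended CF8 access. */
    }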
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1708,14 +1708,18 @@ static int admin_io_okay(
return ioports_access_permitted(v->domain, port, port + bytes - 1);
}
-static int pci_cfg_ok(struct domain *d, int write, int size)
+static bool_t pci_cfg_ok(struct domain *currd, bool_t write,
+ unsigned int start, unsigned int size)
{
uint32_t machine_bdf;
- uint16_t start, end;
- if (!is_hardware_domain(d))
+
+ if ( !is_hardware_domain(currd) )
return 0;
- machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
+ if ( !CF8_ENABLED(currd->arch.pci_cf8) )
+ return 1;
+
+ machine_bdf = CF8_BDF(currd->arch.pci_cf8);
if ( write )
{
const unsigned long *ro_map = pci_get_ro_map(0);
@@ -1723,9 +1727,9 @@ static int pci_cfg_ok(struct domain *d,
if ( ro_map && test_bit(machine_bdf, ro_map) )
return 0;
}
- start = d->arch.pci_cf8 & 0xFF;
+ start |= CF8_ADDR_LO(currd->arch.pci_cf8);
/* AMD extended configuration space access? */
- if ( (d->arch.pci_cf8 & 0x0F000000) &&
+ if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
{
@@ -1734,12 +1738,11 @@ static int pci_cfg_ok(struct domain *d,
if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
return 0;
if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
- start |= (d->arch.pci_cf8 >> 16) & 0xF00;
+ start |= CF8_ADDR_HI(currd->arch.pci_cf8);
}
- end = start + size - 1;
- if (xsm_pci_config_permission(XSM_HOOK, d, machine_bdf, start, end, write))
- return 0;
- return 1;
+
+ return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+ start, start + size - 1, write);
}
uint32_t guest_io_read(
@@ -1793,7 +1796,7 @@ uint32_t guest_io_read(
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 0, size) )
+ if ( pci_cfg_ok(v->domain, 0, port & 3, size) )
sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
}
@@ -1866,7 +1869,7 @@ void guest_io_write(
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 1, size) )
+ if ( pci_cfg_ok(v->domain, 1, port & 3, size) )
pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
}
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2357,11 +2357,6 @@ void hvm_vcpu_down(struct vcpu *v)
static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
ioreq_t *p)
{
-#define CF8_BDF(cf8) (((cf8) & 0x00ffff00) >> 8)
-#define CF8_ADDR_LO(cf8) ((cf8) & 0x000000fc)
-#define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16)
-#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
-
struct hvm_ioreq_server *s;
uint32_t cf8;
uint8_t type;
@@ -2446,11 +2441,6 @@ static struct hvm_ioreq_server *hvm_sele
}
return d->arch.hvm_domain.default_ioreq_server;
-
-#undef CF8_ADDR_ENABLED
-#undef CF8_ADDR_HI
-#undef CF8_ADDR_LO
-#undef CF8_BDF
}
int hvm_buffered_io_send(ioreq_t *p)
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -1,6 +1,11 @@
#ifndef __X86_PCI_H__
#define __X86_PCI_H__
+#define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8)
+#define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc)
+#define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16)
+#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
+
#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \
|| id == 0x01268086 || id == 0x01028086 \
|| id == 0x01128086 || id == 0x01228086 \

View File

@ -1,62 +0,0 @@
References: bsc#925466
# Commit 5cb57f4bddee1f11079e69bf43c193a8b104c476
# Date 2015-06-09 16:00:24 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
kexec: add more pages to v1 environment
Destination pages need mappings to be added to the page tables in the
v1 case (where nothing else calls machine_kexec_add_page() for them).
Further, without the tools mapping the low 1 MB (expected by at least
some Linux versions), we need to do so in the hypervisor in the v1 case.
Suggested-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Alan Robinson <alan.robinson@ts.fujitsu.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/common/kexec.c
+++ b/xen/common/kexec.c
@@ -1003,6 +1003,24 @@ static int kexec_do_load_v1(xen_kexec_lo
if ( ret < 0 )
goto error;
+ if ( arch == EM_386 || arch == EM_X86_64 )
+ {
+ /*
+ * Ensure 0 - 1 MiB is mapped and accessible by the image.
+ *
+ * This allows access to VGA memory and the region purgatory copies
+ * in the crash case.
+ */
+ unsigned long addr;
+
+ for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
+ {
+ ret = machine_kexec_add_page(kimage, addr, addr);
+ if ( ret < 0 )
+ goto error;
+ }
+ }
+
ret = kexec_load_slot(kimage);
if ( ret < 0 )
goto error;
--- a/xen/common/kimage.c
+++ b/xen/common/kimage.c
@@ -923,6 +923,11 @@ int kimage_build_ind(struct kexec_image
ret = kimage_add_page(image, page_to_maddr(xen_page));
if ( ret < 0 )
goto done;
+
+ ret = machine_kexec_add_page(image, dest, dest);
+ if ( ret < 0 )
+ goto done;
+
dest += PAGE_SIZE;
break;
}

View File

@ -1,86 +0,0 @@
# Commit 860313f0411d2dcc6b2fd78bfb834b39d05373a6
# Date 2015-06-10 12:05:21 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EFI: adjust EFI_MEMORY_WP handling for spec version 2.5
That flag now means cachability rather than protection, and a new flag
EFI_MEMORY_RO got added in its place.
Along with EFI_MEMORY_RO also add the two other new EFI_MEMORY_*
definitions, even if we don't need them right away.
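Condensed into a sketch (mirroring the patch's logic; EFI_REVISION() is the
helper the patch adds):

    /* Pre-2.5 firmware uses EFI_MEMORY_WP to mean write-protected; 2.5+
     * redefines WP as a cachability attribute and uses EFI_MEMORY_RO. */
    static bool_t region_is_readonly(UINT32 bs_revision, UINT64 attr)
    {
        return !!(attr & (bs_revision < EFI_REVISION(2, 5) ? EFI_MEMORY_WP
                                                           : EFI_MEMORY_RO));
    }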
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Index: xen-4.5.1-testing/xen/common/efi/boot.c
===================================================================
--- xen-4.5.1-testing.orig/xen/common/efi/boot.c
+++ xen-4.5.1-testing/xen/common/efi/boot.c
@@ -32,6 +32,8 @@
/* Using SetVirtualAddressMap() is incompatible with kexec: */
#undef USE_SET_VIRTUAL_ADDRESS_MAP
+#define EFI_REVISION(major, minor) (((major) << 16) | (minor))
+
#define SHIM_LOCK_PROTOCOL_GUID \
{ 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} }
@@ -76,6 +78,7 @@ static int set_color(u32 mask, int bpp,
static bool_t match_guid(const EFI_GUID *guid1, const EFI_GUID *guid2);
static const EFI_BOOT_SERVICES *__initdata efi_bs;
+static UINT32 __initdata efi_bs_revision;
static EFI_HANDLE __initdata efi_ih;
static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdOut;
@@ -714,6 +717,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SY
efi_ih = ImageHandle;
efi_bs = SystemTable->BootServices;
+ efi_bs_revision = efi_bs->Hdr.Revision;
efi_rs = SystemTable->RuntimeServices;
efi_ct = SystemTable->ConfigurationTable;
efi_num_ct = SystemTable->NumberOfTableEntries;
@@ -1221,6 +1225,9 @@ void __init efi_init_memory(void)
prot |= _PAGE_PAT | MAP_SMALL_PAGES;
else if ( desc->Attribute & (EFI_MEMORY_UC | EFI_MEMORY_UCE) )
prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
+ else if ( efi_bs_revision >= EFI_REVISION(2, 5) &&
+ (desc->Attribute & EFI_MEMORY_WP) )
+ prot |= _PAGE_PAT | _PAGE_PWT | MAP_SMALL_PAGES;
else
{
printk(XENLOG_ERR "Unknown cachability for MFNs %#lx-%#lx%s\n",
@@ -1230,7 +1237,8 @@ void __init efi_init_memory(void)
prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
}
- if ( desc->Attribute & EFI_MEMORY_WP )
+ if ( desc->Attribute & (efi_bs_revision < EFI_REVISION(2, 5)
+ ? EFI_MEMORY_WP : EFI_MEMORY_RO) )
prot &= ~_PAGE_RW;
if ( (desc->Attribute & EFI_MEMORY_XP) && cpu_has_nx )
prot |= _PAGE_NX_BIT;
Index: xen-4.5.1-testing/xen/include/efi/efidef.h
===================================================================
--- xen-4.5.1-testing.orig/xen/include/efi/efidef.h
+++ xen-4.5.1-testing/xen/include/efi/efidef.h
@@ -156,11 +156,15 @@ typedef enum {
#define EFI_MEMORY_WT 0x0000000000000004
#define EFI_MEMORY_WB 0x0000000000000008
#define EFI_MEMORY_UCE 0x0000000000000010
+#define EFI_MEMORY_WP 0x0000000000001000
// physical memory protection on range
-#define EFI_MEMORY_WP 0x0000000000001000
#define EFI_MEMORY_RP 0x0000000000002000
#define EFI_MEMORY_XP 0x0000000000004000
+#define EFI_MEMORY_RO 0x0000000000020000
+
+#define EFI_MEMORY_NV 0x0000000000008000
+#define EFI_MEMORY_MORE_RELIABLE 0x0000000000010000
// range requires a runtime mapping
#define EFI_MEMORY_RUNTIME 0x8000000000000000

View File

@ -1,99 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 284ffb4f9b0d5c3a33c4c5bd87645d0cc342ca96
# Date 2015-06-11 11:52:18 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/vMSI-X: support qword MMIO access
The specification explicitly provides for this, so we should have
supported this from the beginning.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -223,7 +223,7 @@ static int msixtbl_read(
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- if ( len != 4 || (address & 3) )
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
return r;
rcu_read_lock(&msixtbl_rcu_lock);
@@ -241,13 +241,25 @@ static int msixtbl_read(
!acc_bit(test, entry, nr_entry, index) )
goto out;
*pval = entry->gentries[nr_entry].msi_ad[index];
+ if ( len == 8 )
+ {
+ if ( index )
+ offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+ else if ( acc_bit(test, entry, nr_entry, 1) )
+ *pval |= (u64)entry->gentries[nr_entry].msi_ad[1] << 32;
+ else
+ goto out;
+ }
}
- else
+ if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
virt = msixtbl_addr_to_virt(entry, address);
if ( !virt )
goto out;
- *pval = readl(virt);
+ if ( len == 4 )
+ *pval = readl(virt);
+ else
+ *pval |= (u64)readl(virt) << 32;
}
r = X86EMUL_OKAY;
@@ -268,7 +280,7 @@ static int msixtbl_write(struct vcpu *v,
unsigned long flags, orig;
struct irq_desc *desc;
- if ( len != 4 || (address & 3) )
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
return r;
rcu_read_lock(&msixtbl_rcu_lock);
@@ -279,16 +291,23 @@ static int msixtbl_write(struct vcpu *v,
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
- if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+ if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
+ index = offset / sizeof(uint32_t);
if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
{
- index = offset / sizeof(uint32_t);
entry->gentries[nr_entry].msi_ad[index] = val;
acc_bit(set, entry, nr_entry, index);
+ if ( len == 8 && !index )
+ {
+ entry->gentries[nr_entry].msi_ad[1] = val >> 32;
+ acc_bit(set, entry, nr_entry, 1);
+ }
}
set_bit(nr_entry, &entry->table_flags);
- goto out;
+ if ( len != 8 || !index )
+ goto out;
+ val >>= 32;
}
/* Exit to device model when unmasking and address/data got modified. */
@@ -352,7 +371,8 @@ static int msixtbl_write(struct vcpu *v,
unlock:
spin_unlock_irqrestore(&desc->lock, flags);
- r = X86EMUL_OKAY;
+ if ( len == 4 )
+ r = X86EMUL_OKAY;
out:
rcu_read_unlock(&msixtbl_rcu_lock);

View File

@ -1,551 +0,0 @@
# Commit b4650e9a96d78b87ccf7deb4f74733ccfcc64db5
# Date 2015-06-15 13:22:07 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: per-active entry locking
Introduce a per-active entry spin lock to protect active entry state.
The grant table lock must be locked before acquiring (locking) an
active entry.
This is a step in reducing contention on the grant table lock, but
will only do so once the grant table lock is turned into a read-write
lock.
Based on a patch originally by Matt Wilson <msw@amazon.com>.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/docs/misc/grant-tables.txt
+++ b/docs/misc/grant-tables.txt
@@ -63,6 +63,7 @@ is complete.
act->domid : remote domain being granted rights
act->frame : machine frame being granted
act->pin : used to hold reference counts
+ act->lock : spinlock used to serialize access to active entry state
Map tracking
~~~~~~~~~~~~
@@ -74,7 +75,46 @@ is complete.
matching map track entry is then removed, as if unmap had been invoked.
These are not used by the transfer mechanism.
map->domid : owner of the mapped frame
- map->ref_and_flags : grant reference, ro/rw, mapped for host or device access
+ map->ref : grant reference
+ map->flags : ro/rw, mapped for host or device access
+
+********************************************************************************
+ Locking
+ ~~~~~~~
+ Xen uses several locks to serialize access to the internal grant table state.
+
+ grant_table->lock : lock used to prevent readers from accessing
+ inconsistent grant table state such as current
+ version, partially initialized active table pages,
+ etc.
+ active_grant_entry->lock : spinlock used to serialize modifications to
+ active entries
+
+ The primary lock for the grant table is a spinlock. All functions
+ that access members of struct grant_table must acquire the lock
+ around critical sections.
+
+ Active entries are obtained by calling active_entry_acquire(gt, ref).
+ This function returns a pointer to the active entry after locking its
+ spinlock. The caller must hold the grant table lock for the gt in
+ question before calling active_entry_acquire(). This is because the
+ grant table can be dynamically extended via gnttab_grow_table() while
+ a domain is running and must be fully initialized. Once all access to
+ the active entry is complete, release the lock by calling
+ active_entry_release(act).
+
+ Summary of rules for locking:
+ active_entry_acquire() and active_entry_release() can only be
+ called when holding the relevant grant table's lock. I.e.:
+ spin_lock(&gt->lock);
+ act = active_entry_acquire(gt, ref);
+ ...
+ active_entry_release(act);
+ spin_unlock(&gt->lock);
+
+ Active entries cannot be acquired while holding the maptrack lock.
+ Multiple active entries can be acquired while holding the grant table
+ lock.
********************************************************************************
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -157,10 +157,13 @@ struct active_grant_entry {
in the page. */
unsigned length:16; /* For sub-page grants, the length of the
grant. */
+ spinlock_t lock; /* lock to protect access of this entry.
+ see docs/misc/grant-tables.txt for
+ locking protocol */
};
#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
-#define active_entry(t, e) \
+#define _active_entry(t, e) \
((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
static inline void gnttab_flush_tlb(const struct domain *d)
@@ -188,6 +191,24 @@ nr_active_grant_frames(struct grant_tabl
return num_act_frames_from_sha_frames(nr_grant_frames(gt));
}
+static inline struct active_grant_entry *
+active_entry_acquire(struct grant_table *t, grant_ref_t e)
+{
+ struct active_grant_entry *act;
+
+ ASSERT(spin_is_locked(&t->lock));
+
+ act = &_active_entry(t, e);
+ spin_lock(&act->lock);
+
+ return act;
+}
+
+static inline void active_entry_release(struct active_grant_entry *act)
+{
+ spin_unlock(&act->lock);
+}
+
/* Check if the page has been paged out, or needs unsharing.
If rc == GNTST_okay, *page contains the page struct with a ref taken.
Caller must do put_page(*page).
@@ -505,7 +526,6 @@ static int grant_map_exists(const struct
unsigned long mfn,
unsigned int *ref_count)
{
- const struct active_grant_entry *act;
unsigned int ref, max_iter;
ASSERT(spin_is_locked(&rgt->lock));
@@ -514,18 +534,19 @@ static int grant_map_exists(const struct
nr_grant_entries(rgt));
for ( ref = *ref_count; ref < max_iter; ref++ )
{
- act = &active_entry(rgt, ref);
+ struct active_grant_entry *act;
+ bool_t exists;
- if ( !act->pin )
- continue;
+ act = active_entry_acquire(rgt, ref);
- if ( act->domid != ld->domain_id )
- continue;
+ exists = act->pin
+ && act->domid == ld->domain_id
+ && act->frame == mfn;
- if ( act->frame != mfn )
- continue;
+ active_entry_release(act);
- return 0;
+ if ( exists )
+ return 0;
}
if ( ref < nr_grant_entries(rgt) )
@@ -546,13 +567,24 @@ static void mapcount(
*wrc = *rdc = 0;
+ /*
+ * Must have the local domain's grant table lock when iterating
+ * over its maptrack entries.
+ */
+ ASSERT(spin_is_locked(&lgt->lock));
+ /*
+ * Must have the remote domain's grant table lock while counting
+ * its active entries.
+ */
+ ASSERT(spin_is_locked(&rd->grant_table->lock));
+
for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
{
map = &maptrack_entry(lgt, handle);
if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
map->domid != rd->domain_id )
continue;
- if ( active_entry(rd->grant_table, map->ref).frame == mfn )
+ if ( _active_entry(rd->grant_table, map->ref).frame == mfn )
(map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++;
}
}
@@ -639,7 +671,7 @@ __gnttab_map_grant_ref(
if ( unlikely(op->ref >= nr_grant_entries(rgt)))
PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref);
- act = &active_entry(rgt, op->ref);
+ act = active_entry_acquire(rgt, op->ref);
shah = shared_entry_header(rgt, op->ref);
if (rgt->gt_version == 1) {
sha1 = &shared_entry_v1(rgt, op->ref);
@@ -656,7 +688,7 @@ __gnttab_map_grant_ref(
((act->domid != ld->domain_id) ||
(act->pin & 0x80808080U) != 0 ||
(act->is_sub_page)) )
- PIN_FAIL(unlock_out, GNTST_general_error,
+ PIN_FAIL(act_release_out, GNTST_general_error,
"Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n",
act->domid, ld->domain_id, act->pin, act->is_sub_page);
@@ -667,7 +699,7 @@ __gnttab_map_grant_ref(
if ( (rc = _set_status(rgt->gt_version, ld->domain_id,
op->flags & GNTMAP_readonly,
1, shah, act, status) ) != GNTST_okay )
- goto unlock_out;
+ goto act_release_out;
if ( !act->pin )
{
@@ -702,6 +734,7 @@ __gnttab_map_grant_ref(
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
+ active_entry_release(act);
spin_unlock(&rgt->lock);
/* pg may be set, with a refcount included, from __get_paged_frame */
@@ -839,7 +872,7 @@ __gnttab_map_grant_ref(
spin_lock(&rgt->lock);
- act = &active_entry(rgt, op->ref);
+ act = active_entry_acquire(rgt, op->ref);
if ( op->flags & GNTMAP_device_map )
act->pin -= (op->flags & GNTMAP_readonly) ?
@@ -856,6 +889,9 @@ __gnttab_map_grant_ref(
if ( !act->pin )
gnttab_clear_flag(_GTF_reading, status);
+ act_release_out:
+ active_entry_release(act);
+
unlock_out:
spin_unlock(&rgt->lock);
op->status = rc;
@@ -950,7 +986,7 @@ __gnttab_unmap_common(
}
op->rd = rd;
- act = &active_entry(rgt, op->map->ref);
+ act = active_entry_acquire(rgt, op->map->ref);
if ( op->frame == 0 )
{
@@ -959,7 +995,7 @@ __gnttab_unmap_common(
else
{
if ( unlikely(op->frame != act->frame) )
- PIN_FAIL(unmap_out, GNTST_general_error,
+ PIN_FAIL(act_release_out, GNTST_general_error,
"Bad frame number doesn't match gntref. (%lx != %lx)\n",
op->frame, act->frame);
if ( op->flags & GNTMAP_device_map )
@@ -978,7 +1014,7 @@ __gnttab_unmap_common(
if ( (rc = replace_grant_host_mapping(op->host_addr,
op->frame, op->new_addr,
op->flags)) < 0 )
- goto unmap_out;
+ goto act_release_out;
ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask));
op->map->flags &= ~GNTMAP_host_map;
@@ -1000,7 +1036,7 @@ __gnttab_unmap_common(
if ( err )
{
rc = GNTST_general_error;
- goto unmap_out;
+ goto act_release_out;
}
}
@@ -1008,8 +1044,11 @@ __gnttab_unmap_common(
if ( !(op->flags & GNTMAP_readonly) )
gnttab_mark_dirty(rd, op->frame);
+ act_release_out:
+ active_entry_release(act);
unmap_out:
double_gt_unlock(lgt, rgt);
+
op->status = rc;
rcu_unlock_domain(rd);
}
@@ -1042,9 +1081,9 @@ __gnttab_unmap_common_complete(struct gn
spin_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
- goto unmap_out;
+ goto unlock_out;
- act = &active_entry(rgt, op->map->ref);
+ act = active_entry_acquire(rgt, op->map->ref);
sha = shared_entry_header(rgt, op->map->ref);
if ( rgt->gt_version == 1 )
@@ -1058,7 +1097,7 @@ __gnttab_unmap_common_complete(struct gn
* Suggests that __gntab_unmap_common failed early and so
* nothing further to do
*/
- goto unmap_out;
+ goto act_release_out;
}
pg = mfn_to_page(op->frame);
@@ -1082,7 +1121,7 @@ __gnttab_unmap_common_complete(struct gn
* Suggests that __gntab_unmap_common failed in
* replace_grant_host_mapping() so nothing further to do
*/
- goto unmap_out;
+ goto act_release_out;
}
if ( !is_iomem_page(op->frame) )
@@ -1103,8 +1142,11 @@ __gnttab_unmap_common_complete(struct gn
if ( act->pin == 0 )
gnttab_clear_flag(_GTF_reading, status);
- unmap_out:
+ act_release_out:
+ active_entry_release(act);
+ unlock_out:
spin_unlock(&rgt->lock);
+
if ( put_handle )
{
op->map->flags = 0;
@@ -1296,7 +1338,7 @@ gnttab_grow_table(struct domain *d, unsi
/* d's grant table lock must be held by the caller */
struct grant_table *gt = d->grant_table;
- unsigned int i;
+ unsigned int i, j;
ASSERT(req_nr_frames <= max_grant_frames);
@@ -1311,6 +1353,8 @@ gnttab_grow_table(struct domain *d, unsi
if ( (gt->active[i] = alloc_xenheap_page()) == NULL )
goto active_alloc_failed;
clear_page(gt->active[i]);
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+ spin_lock_init(&gt->active[i][j].lock);
}
/* Shared */
@@ -1805,7 +1849,7 @@ __release_grant_for_copy(
spin_lock(&rgt->lock);
- act = &active_entry(rgt, gref);
+ act = active_entry_acquire(rgt, gref);
sha = shared_entry_header(rgt, gref);
r_frame = act->frame;
@@ -1844,6 +1888,7 @@ __release_grant_for_copy(
released_read = 1;
}
+ active_entry_release(act);
spin_unlock(&rgt->lock);
if ( td != rd )
@@ -1905,14 +1950,14 @@ __acquire_grant_for_copy(
spin_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
- PIN_FAIL(unlock_out, GNTST_general_error,
+ PIN_FAIL(gt_unlock_out, GNTST_general_error,
"remote grant table not ready\n");
if ( unlikely(gref >= nr_grant_entries(rgt)) )
- PIN_FAIL(unlock_out, GNTST_bad_gntref,
+ PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
"Bad grant reference %ld\n", gref);
- act = &active_entry(rgt, gref);
+ act = active_entry_acquire(rgt, gref);
shah = shared_entry_header(rgt, gref);
if ( rgt->gt_version == 1 )
{
@@ -1971,6 +2016,13 @@ __acquire_grant_for_copy(
PIN_FAIL(unlock_out_clear, GNTST_general_error,
"transitive grant referenced bad domain %d\n",
trans_domid);
+
+ /*
+ * __acquire_grant_for_copy() could take the lock on the
+ * remote table (if rd == td), so we have to drop the lock
+ * here and reacquire
+ */
+ active_entry_release(act);
spin_unlock(&rgt->lock);
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
@@ -1978,9 +2030,12 @@ __acquire_grant_for_copy(
&trans_page_off, &trans_length, 0);
spin_lock(&rgt->lock);
+ act = active_entry_acquire(rgt, gref);
+
if ( rc != GNTST_okay ) {
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
+ active_entry_release(act);
spin_unlock(&rgt->lock);
return rc;
}
@@ -1993,6 +2048,7 @@ __acquire_grant_for_copy(
{
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
+ active_entry_release(act);
spin_unlock(&rgt->lock);
put_page(*page);
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
@@ -2061,6 +2117,7 @@ __acquire_grant_for_copy(
*length = act->length;
*frame = act->frame;
+ active_entry_release(act);
spin_unlock(&rgt->lock);
return rc;
@@ -2073,7 +2130,11 @@ __acquire_grant_for_copy(
gnttab_clear_flag(_GTF_reading, status);
unlock_out:
+ active_entry_release(act);
+
+ gt_unlock_out:
spin_unlock(&rgt->lock);
+
return rc;
}
@@ -2373,7 +2434,6 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
gnttab_set_version_t op;
struct domain *d = current->domain;
struct grant_table *gt = d->grant_table;
- struct active_grant_entry *act;
grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES];
long res;
int i;
@@ -2398,8 +2458,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
{
for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ )
{
- act = &active_entry(gt, i);
- if ( act->pin != 0 )
+ if ( read_atomic(&_active_entry(gt, i).pin) != 0 )
{
gdprintk(XENLOG_WARNING,
"tried to change grant table version from %d to %d, but some grant entries still in use\n",
@@ -2586,7 +2645,8 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
{
struct domain *d = rcu_lock_current_domain();
struct grant_table *gt = d->grant_table;
- struct active_grant_entry *act;
+ struct active_grant_entry *act_a = NULL;
+ struct active_grant_entry *act_b = NULL;
s16 rc = GNTST_okay;
spin_lock(&gt->lock);
@@ -2600,12 +2660,16 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
if ( unlikely(ref_b >= nr_grant_entries(d->grant_table)))
PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b);
- act = &active_entry(gt, ref_a);
- if ( act->pin )
+ /* Swapping the same ref is a no-op. */
+ if ( ref_a == ref_b )
+ goto out;
+
+ act_a = active_entry_acquire(gt, ref_a);
+ if ( act_a->pin )
PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a);
- act = &active_entry(gt, ref_b);
- if ( act->pin )
+ act_b = active_entry_acquire(gt, ref_b);
+ if ( act_b->pin )
PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b);
if ( gt->gt_version == 1 )
@@ -2632,6 +2696,10 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
}
out:
+ if ( act_b != NULL )
+ active_entry_release(act_b);
+ if ( act_a != NULL )
+ active_entry_release(act_a);
spin_unlock(&gt->lock);
rcu_unlock_domain(d);
@@ -2941,7 +3009,7 @@ grant_table_create(
struct domain *d)
{
struct grant_table *t;
- int i;
+ unsigned int i, j;
if ( (t = xzalloc(struct grant_table)) == NULL )
goto no_mem_0;
@@ -2960,6 +3028,8 @@ grant_table_create(
if ( (t->active[i] = alloc_xenheap_page()) == NULL )
goto no_mem_2;
clear_page(t->active[i]);
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+ spin_lock_init(&t->active[i][j].lock);
}
/* Tracking of mapped foreign frames table */
@@ -3056,7 +3126,7 @@ gnttab_release_mappings(
rgt = rd->grant_table;
spin_lock(&rgt->lock);
- act = &active_entry(rgt, ref);
+ act = active_entry_acquire(rgt, ref);
sha = shared_entry_header(rgt, ref);
if (rgt->gt_version == 1)
status = &sha->flags;
@@ -3114,6 +3184,7 @@ gnttab_release_mappings(
if ( act->pin == 0 )
gnttab_clear_flag(_GTF_reading, status);
+ active_entry_release(act);
spin_unlock(&rgt->lock);
rcu_unlock_domain(rd);
@@ -3176,9 +3247,12 @@ static void gnttab_usage_print(struct do
uint16_t status;
uint64_t frame;
- act = &active_entry(gt, ref);
+ act = active_entry_acquire(gt, ref);
if ( !act->pin )
+ {
+ active_entry_release(act);
continue;
+ }
sha = shared_entry_header(gt, ref);
@@ -3208,6 +3282,7 @@ static void gnttab_usage_print(struct do
printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n",
ref, act->domid, act->frame, act->pin,
sha->domid, frame, status);
+ active_entry_release(act);
}
out:

View File

@@ -1,86 +0,0 @@
# Commit 5a9899ddc42040e139233a6b1f0f65f3b65eda6d
# Date 2015-06-15 13:23:34 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: introduce maptrack lock
Split the grant table lock into two separate locks: one to protect the
maptrack free list (maptrack_lock) and one for everything else (lock).
Based on a patch originally by Matt Wilson <msw@amazon.com>.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
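A minimal stand-alone sketch of the two-lock arrangement and its nesting rule, with pthread mutexes standing in for Xen's spinlocks; the demo_* names are invented for illustration and are not code from this patch:

#include <pthread.h>

struct demo_grant_table {
    pthread_mutex_t lock;          /* protects version, active/shared state */
    pthread_mutex_t maptrack_lock; /* protects only the maptrack free list */
    unsigned int maptrack_head;
};

/*
 * Nesting rule from the patch: the maptrack lock may be taken while the
 * grant table lock is held, never the other way around.
 */
static unsigned int demo_pop_handle(struct demo_grant_table *t)
{
    unsigned int h;

    pthread_mutex_lock(&t->lock);
    pthread_mutex_lock(&t->maptrack_lock);
    h = t->maptrack_head;
    pthread_mutex_unlock(&t->maptrack_lock);
    pthread_mutex_unlock(&t->lock);
    return h;
}
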
--- a/docs/misc/grant-tables.txt
+++ b/docs/misc/grant-tables.txt
@@ -87,6 +87,7 @@ is complete.
inconsistent grant table state such as current
version, partially initialized active table pages,
etc.
+ grant_table->maptrack_lock : spinlock used to protect the maptrack free list
active_grant_entry->lock : spinlock used to serialize modifications to
active entries
@@ -94,6 +95,9 @@ is complete.
that access members of struct grant_table must acquire the lock
around critical sections.
+ The maptrack free list is protected by its own spinlock. The maptrack
+ lock may be locked while holding the grant table lock.
+
Active entries are obtained by calling active_entry_acquire(gt, ref).
This function returns a pointer to the active entry after locking its
spinlock. The caller must hold the grant table lock for the gt in
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -288,10 +288,10 @@ static inline void
put_maptrack_handle(
struct grant_table *t, int handle)
{
- spin_lock(&t->lock);
+ spin_lock(&t->maptrack_lock);
maptrack_entry(t, handle).ref = t->maptrack_head;
t->maptrack_head = handle;
- spin_unlock(&t->lock);
+ spin_unlock(&t->maptrack_lock);
}
static inline int
@@ -303,7 +303,7 @@ get_maptrack_handle(
struct grant_mapping *new_mt;
unsigned int new_mt_limit, nr_frames;
- spin_lock(&lgt->lock);
+ spin_lock(&lgt->maptrack_lock);
while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
{
@@ -332,7 +332,7 @@ get_maptrack_handle(
nr_frames + 1);
}
- spin_unlock(&lgt->lock);
+ spin_unlock(&lgt->maptrack_lock);
return handle;
}
@@ -3016,6 +3016,7 @@ grant_table_create(
/* Simple stuff. */
spin_lock_init(&t->lock);
+ spin_lock_init(&t->maptrack_lock);
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
/* Active grant table. */
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -82,6 +82,8 @@ struct grant_table {
struct grant_mapping **maptrack;
unsigned int maptrack_head;
unsigned int maptrack_limit;
+ /* Lock protecting the maptrack page list, head, and limit */
+ spinlock_t maptrack_lock;
/* Lock protecting updates to active and shared grant tables. */
spinlock_t lock;
/* The defined versions are 1 and 2. Set to 0 if we don't know

View File

@@ -1,733 +0,0 @@
# Commit 40de9fffb4cc0b0485aa3391d72e2220b8e1ce12
# Date 2015-06-15 13:25:20 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: make the grant table lock a read-write lock
In combination with the per-active entry locks, the grant table lock
can be made a read-write lock since in the majority of cases only the
read lock is required. The grant table read lock protects against
changes to the table version or size (which are done with the write
lock held).
The write lock is also required when two active entries must be
acquired.
The double lock is still required when updating IOMMU page tables.
With the lock contention being only on the maptrack lock (unless IOMMU
updates are required), performance and scalability are improved.
Based on a patch originally by Matt Wilson <msw@amazon.com>.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
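A minimal sketch of the read-mostly pattern described above, with a pthread rwlock standing in for the grant table lock and a mutex for the per-active-entry spinlock; all demo_* names are invented and this is not code from the patch:

#include <pthread.h>

struct demo_entry {
    pthread_mutex_t lock;   /* per-active-entry lock */
    unsigned int pin;
};

struct demo_table {
    pthread_rwlock_t lock;  /* protects table size and version */
    struct demo_entry *entries;
    unsigned int nr_entries;
};

/* Common case: inspect one entry under the read lock. */
static unsigned int demo_read_pin(struct demo_table *t, unsigned int ref)
{
    unsigned int pin;

    pthread_rwlock_rdlock(&t->lock);            /* readers run in parallel */
    pthread_mutex_lock(&t->entries[ref].lock);
    pin = t->entries[ref].pin;
    pthread_mutex_unlock(&t->entries[ref].lock);
    pthread_rwlock_unlock(&t->lock);
    return pin;
}

/* Rare case: growing the table needs exclusive access. */
static void demo_grow(struct demo_table *t, struct demo_entry *bigger,
                      unsigned int new_nr)
{
    pthread_rwlock_wrlock(&t->lock);            /* excludes all readers */
    t->entries = bigger;
    t->nr_entries = new_nr;
    pthread_rwlock_unlock(&t->lock);
}
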
--- a/docs/misc/grant-tables.txt
+++ b/docs/misc/grant-tables.txt
@@ -83,7 +83,7 @@ is complete.
~~~~~~~
Xen uses several locks to serialize access to the internal grant table state.
- grant_table->lock : lock used to prevent readers from accessing
+ grant_table->lock : rwlock used to prevent readers from accessing
inconsistent grant table state such as current
version, partially initialized active table pages,
etc.
@@ -91,34 +91,43 @@ is complete.
active_grant_entry->lock : spinlock used to serialize modifications to
active entries
- The primary lock for the grant table is a spinlock. All functions
- that access members of struct grant_table must acquire the lock
- around critical sections.
+ The primary lock for the grant table is a read/write spinlock. All
+ functions that access members of struct grant_table must acquire a
+ read lock around critical sections. Any modification to the members
+ of struct grant_table (e.g., nr_status_frames, nr_grant_frames,
+ active frames, etc.) must only be made if the write lock is
+ held. These elements are read-mostly, and read critical sections can
+ be large, which makes a rwlock a good choice.
The maptrack free list is protected by its own spinlock. The maptrack
lock may be locked while holding the grant table lock.
Active entries are obtained by calling active_entry_acquire(gt, ref).
This function returns a pointer to the active entry after locking its
- spinlock. The caller must hold the grant table lock for the gt in
- question before calling active_entry_acquire(). This is because the
- grant table can be dynamically extended via gnttab_grow_table() while
- a domain is running and must be fully initialized. Once all access to
- the active entry is complete, release the lock by calling
- active_entry_release(act).
+ spinlock. The caller must hold the grant table read lock before
+ calling active_entry_acquire(). This is because the grant table can
+ be dynamically extended via gnttab_grow_table() while a domain is
+ running and must be fully initialized. Once all access to the active
+ entry is complete, release the lock by calling active_entry_release(act).
Summary of rules for locking:
active_entry_acquire() and active_entry_release() can only be
- called when holding the relevant grant table's lock. I.e.:
- spin_lock(&gt->lock);
+ called when holding the relevant grant table's read lock. I.e.:
+ read_lock(&gt->lock);
act = active_entry_acquire(gt, ref);
...
active_entry_release(act);
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
Active entries cannot be acquired while holding the maptrack lock.
Multiple active entries can be acquired while holding the grant table
- lock.
+ _write_ lock.
+
+ Maptrack entries are protected by the corresponding active entry
+ lock. As an exception, new maptrack entries may be populated without
+ holding the lock, provided the flags field is written last. This
+ requires any maptrack entry user validates the flags field as
+ non-zero first.
********************************************************************************
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1037,7 +1037,7 @@ int xenmem_add_to_physmap_one(
switch ( space )
{
case XENMAPSPACE_grant_table:
- spin_lock(&d->grant_table->lock);
+ write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -1067,7 +1067,7 @@ int xenmem_add_to_physmap_one(
t = p2m_ram_rw;
- spin_unlock(&d->grant_table->lock);
+ write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_shared_info:
if ( idx != 0 )
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4595,7 +4595,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->shared_info);
break;
case XENMAPSPACE_grant_table:
- spin_lock(&d->grant_table->lock);
+ write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -4617,7 +4617,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
}
- spin_unlock(&d->grant_table->lock);
+ write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_gmfn_range:
case XENMAPSPACE_gmfn:
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -196,7 +196,7 @@ active_entry_acquire(struct grant_table
{
struct active_grant_entry *act;
- ASSERT(spin_is_locked(&t->lock));
+ ASSERT(rw_is_locked(&t->lock));
act = &_active_entry(t, e);
spin_lock(&act->lock);
@@ -252,25 +252,29 @@ static int __get_paged_frame(unsigned lo
static inline void
double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
{
+ /*
+ * See mapcount() for why the write lock is also required for the
+ * remote domain.
+ */
if ( lgt < rgt )
{
- spin_lock(&lgt->lock);
- spin_lock(&rgt->lock);
+ write_lock(&lgt->lock);
+ write_lock(&rgt->lock);
}
else
{
if ( lgt != rgt )
- spin_lock(&rgt->lock);
- spin_lock(&lgt->lock);
+ write_lock(&rgt->lock);
+ write_lock(&lgt->lock);
}
}
static inline void
double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
{
- spin_unlock(&lgt->lock);
+ write_unlock(&lgt->lock);
if ( lgt != rgt )
- spin_unlock(&rgt->lock);
+ write_unlock(&rgt->lock);
}
static inline int
@@ -528,7 +532,7 @@ static int grant_map_exists(const struct
{
unsigned int ref, max_iter;
- ASSERT(spin_is_locked(&rgt->lock));
+ ASSERT(rw_is_locked(&rgt->lock));
max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
nr_grant_entries(rgt));
@@ -568,15 +572,15 @@ static void mapcount(
*wrc = *rdc = 0;
/*
- * Must have the local domain's grant table lock when iterating
- * over its maptrack entries.
+ * Must have the local domain's grant table write lock when
+ * iterating over its maptrack entries.
*/
- ASSERT(spin_is_locked(&lgt->lock));
+ ASSERT(rw_is_write_locked(&lgt->lock));
/*
- * Must have the remote domain's grant table lock while counting
- * its active entries.
+ * Must have the remote domain's grant table write lock while
+ * counting its active entries.
*/
- ASSERT(spin_is_locked(&rd->grant_table->lock));
+ ASSERT(rw_is_write_locked(&rd->grant_table->lock));
for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
{
@@ -616,6 +620,7 @@ __gnttab_map_grant_ref(
grant_entry_v2_t *sha2;
grant_entry_header_t *shah;
uint16_t *status;
+ bool_t need_iommu;
led = current;
ld = led->domain;
@@ -661,7 +666,7 @@ __gnttab_map_grant_ref(
}
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
PIN_FAIL(unlock_out, GNTST_general_error,
@@ -735,7 +740,7 @@ __gnttab_map_grant_ref(
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
/* pg may be set, with a refcount included, from __get_paged_frame */
if ( !pg )
@@ -811,12 +816,14 @@ __gnttab_map_grant_ref(
goto undo_out;
}
- double_gt_lock(lgt, rgt);
-
- if ( gnttab_need_iommu_mapping(ld) )
+ need_iommu = gnttab_need_iommu_mapping(ld);
+ if ( need_iommu )
{
unsigned int wrc, rdc;
int err = 0;
+
+ double_gt_lock(lgt, rgt);
+
/* We're not translated, so we know that gmfns and mfns are
the same things, so the IOMMU entry is always 1-to-1. */
mapcount(lgt, rd, frame, &wrc, &rdc);
@@ -842,12 +849,22 @@ __gnttab_map_grant_ref(
TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
+ /*
+ * All maptrack entry users check mt->flags first before using the
+ * other fields so just ensure the flags field is stored last.
+ *
+ * However, if gnttab_need_iommu_mapping() then this would race
+ * with a concurrent mapcount() call (on an unmap, for example)
+ * and a lock is required.
+ */
mt = &maptrack_entry(lgt, handle);
mt->domid = op->dom;
mt->ref = op->ref;
- mt->flags = op->flags;
+ wmb();
+ write_atomic(&mt->flags, op->flags);
- double_gt_unlock(lgt, rgt);
+ if ( need_iommu )
+ double_gt_unlock(lgt, rgt);
op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
op->handle = handle;
@@ -870,7 +887,7 @@ __gnttab_map_grant_ref(
put_page(pg);
}
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
act = active_entry_acquire(rgt, op->ref);
@@ -893,7 +910,7 @@ __gnttab_map_grant_ref(
active_entry_release(act);
unlock_out:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
op->status = rc;
put_maptrack_handle(lgt, handle);
rcu_unlock_domain(rd);
@@ -943,18 +960,19 @@ __gnttab_unmap_common(
}
op->map = &maptrack_entry(lgt, op->handle);
- spin_lock(&lgt->lock);
- if ( unlikely(!op->map->flags) )
+ read_lock(&lgt->lock);
+
+ if ( unlikely(!read_atomic(&op->map->flags)) )
{
- spin_unlock(&lgt->lock);
+ read_unlock(&lgt->lock);
gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
op->status = GNTST_bad_handle;
return;
}
dom = op->map->domid;
- spin_unlock(&lgt->lock);
+ read_unlock(&lgt->lock);
if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
{
@@ -975,9 +993,10 @@ __gnttab_unmap_common(
TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom);
rgt = rd->grant_table;
- double_gt_lock(lgt, rgt);
- op->flags = op->map->flags;
+ read_lock(&rgt->lock);
+
+ op->flags = read_atomic(&op->map->flags);
if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
{
gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle);
@@ -1024,31 +1043,34 @@ __gnttab_unmap_common(
act->pin -= GNTPIN_hstw_inc;
}
- if ( gnttab_need_iommu_mapping(ld) )
+ act_release_out:
+ active_entry_release(act);
+ unmap_out:
+ read_unlock(&rgt->lock);
+
+ if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
{
unsigned int wrc, rdc;
int err = 0;
+
+ double_gt_lock(lgt, rgt);
+
mapcount(lgt, rd, op->frame, &wrc, &rdc);
if ( (wrc + rdc) == 0 )
err = iommu_unmap_page(ld, op->frame);
else if ( wrc == 0 )
err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable);
+
+ double_gt_unlock(lgt, rgt);
+
if ( err )
- {
rc = GNTST_general_error;
- goto act_release_out;
- }
}
/* If just unmapped a writable mapping, mark as dirtied */
- if ( !(op->flags & GNTMAP_readonly) )
+ if ( rc == GNTST_okay && !(op->flags & GNTMAP_readonly) )
gnttab_mark_dirty(rd, op->frame);
- act_release_out:
- active_entry_release(act);
- unmap_out:
- double_gt_unlock(lgt, rgt);
-
op->status = rc;
rcu_unlock_domain(rd);
}
@@ -1078,8 +1100,8 @@ __gnttab_unmap_common_complete(struct gn
rcu_lock_domain(rd);
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
goto unlock_out;
@@ -1145,7 +1167,7 @@ __gnttab_unmap_common_complete(struct gn
act_release_out:
active_entry_release(act);
unlock_out:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
if ( put_handle )
{
@@ -1332,11 +1354,13 @@ gnttab_unpopulate_status_frames(struct d
gt->nr_status_frames = 0;
}
+/*
+ * Grow the grant table. The caller must hold the grant table's
+ * write lock before calling this function.
+ */
int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
{
- /* d's grant table lock must be held by the caller */
-
struct grant_table *gt = d->grant_table;
unsigned int i, j;
@@ -1442,7 +1466,7 @@ gnttab_setup_table(
}
gt = d->grant_table;
- spin_lock(&gt->lock);
+ write_lock(&gt->lock);
if ( gt->gt_version == 0 )
gt->gt_version = 1;
@@ -1470,7 +1494,7 @@ gnttab_setup_table(
}
out3:
- spin_unlock(&gt->lock);
+ write_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -1512,13 +1536,13 @@ gnttab_query_size(
goto query_out_unlock;
}
- spin_lock(&d->grant_table->lock);
+ read_lock(&d->grant_table->lock);
op.nr_frames = nr_grant_frames(d->grant_table);
op.max_nr_frames = max_grant_frames;
op.status = GNTST_okay;
- spin_unlock(&d->grant_table->lock);
+ read_unlock(&d->grant_table->lock);
query_out_unlock:
@@ -1544,7 +1568,7 @@ gnttab_prepare_for_transfer(
union grant_combo scombo, prev_scombo, new_scombo;
int retries = 0;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
{
@@ -1595,11 +1619,11 @@ gnttab_prepare_for_transfer(
scombo = prev_scombo;
}
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return 1;
fail:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return 0;
}
@@ -1614,6 +1638,7 @@ gnttab_transfer(
struct gnttab_transfer gop;
unsigned long mfn;
unsigned int max_bitsize;
+ struct active_grant_entry *act;
for ( i = 0; i < count; i++ )
{
@@ -1791,7 +1816,8 @@ gnttab_transfer(
TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
/* Tell the guest about its new page frame. */
- spin_lock(&e->grant_table->lock);
+ read_lock(&e->grant_table->lock);
+ act = active_entry_acquire(e->grant_table, gop.ref);
if ( e->grant_table->gt_version == 1 )
{
@@ -1809,7 +1835,8 @@ gnttab_transfer(
shared_entry_header(e->grant_table, gop.ref)->flags |=
GTF_transfer_completed;
- spin_unlock(&e->grant_table->lock);
+ active_entry_release(act);
+ read_unlock(&e->grant_table->lock);
rcu_unlock_domain(e);
@@ -1847,7 +1874,7 @@ __release_grant_for_copy(
released_read = 0;
released_write = 0;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
act = active_entry_acquire(rgt, gref);
sha = shared_entry_header(rgt, gref);
@@ -1889,7 +1916,7 @@ __release_grant_for_copy(
}
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
if ( td != rd )
{
@@ -1947,7 +1974,7 @@ __acquire_grant_for_copy(
*page = NULL;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
PIN_FAIL(gt_unlock_out, GNTST_general_error,
@@ -2023,20 +2050,20 @@ __acquire_grant_for_copy(
* here and reacquire
*/
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
readonly, &grant_frame, page,
&trans_page_off, &trans_length, 0);
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
act = active_entry_acquire(rgt, gref);
if ( rc != GNTST_okay ) {
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return rc;
}
@@ -2049,7 +2076,7 @@ __acquire_grant_for_copy(
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
put_page(*page);
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
frame, page, page_off, length,
@@ -2118,7 +2145,7 @@ __acquire_grant_for_copy(
*frame = act->frame;
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return rc;
unlock_out_clear:
@@ -2133,7 +2160,7 @@ __acquire_grant_for_copy(
active_entry_release(act);
gt_unlock_out:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return rc;
}
@@ -2449,7 +2476,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
if ( gt->gt_version == op.version )
goto out;
- spin_lock(&gt->lock);
+ write_lock(&gt->lock);
/* Make sure that the grant table isn't currently in use when we
change the version number, except for the first 8 entries which
are allowed to be in use (xenstore/xenconsole keeps them mapped).
@@ -2534,7 +2561,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA
gt->gt_version = op.version;
out_unlock:
- spin_unlock(&gt->lock);
+ write_unlock(&gt->lock);
out:
op.version = gt->gt_version;
@@ -2590,7 +2617,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL
op.status = GNTST_okay;
- spin_lock(&gt->lock);
+ read_lock(&gt->lock);
for ( i = 0; i < op.nr_frames; i++ )
{
@@ -2599,7 +2626,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDL
op.status = GNTST_bad_virt_addr;
}
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -2649,7 +2676,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_
struct active_grant_entry *act_b = NULL;
s16 rc = GNTST_okay;
- spin_lock(&gt->lock);
+ write_lock(&gt->lock);
if ( gt->gt_version == 0 )
PIN_FAIL(out, GNTST_general_error, "grant table not yet set up\n");
@@ -2700,7 +2727,7 @@ out:
active_entry_release(act_b);
if ( act_a != NULL )
active_entry_release(act_a);
- spin_unlock(&gt->lock);
+ write_unlock(&gt->lock);
rcu_unlock_domain(d);
@@ -2771,12 +2798,12 @@ static int __gnttab_cache_flush(gnttab_c
if ( d != owner )
{
- spin_lock(&owner->grant_table->lock);
+ read_lock(&owner->grant_table->lock);
ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
if ( ret != 0 )
{
- spin_unlock(&owner->grant_table->lock);
+ read_unlock(&owner->grant_table->lock);
rcu_unlock_domain(d);
put_page(page);
return ret;
@@ -2796,7 +2823,7 @@ static int __gnttab_cache_flush(gnttab_c
ret = 0;
if ( d != owner )
- spin_unlock(&owner->grant_table->lock);
+ read_unlock(&owner->grant_table->lock);
unmap_domain_page(v);
put_page(page);
@@ -3015,7 +3042,7 @@ grant_table_create(
goto no_mem_0;
/* Simple stuff. */
- spin_lock_init(&t->lock);
+ rwlock_init(&t->lock);
spin_lock_init(&t->maptrack_lock);
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
@@ -3125,7 +3152,7 @@ gnttab_release_mappings(
}
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
act = active_entry_acquire(rgt, ref);
sha = shared_entry_header(rgt, ref);
@@ -3186,7 +3213,7 @@ gnttab_release_mappings(
gnttab_clear_flag(_GTF_reading, status);
active_entry_release(act);
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
rcu_unlock_domain(rd);
@@ -3234,7 +3261,7 @@ static void gnttab_usage_print(struct do
printk(" -------- active -------- -------- shared --------\n");
printk("[ref] localdom mfn pin localdom gmfn flags\n");
- spin_lock(&gt->lock);
+ read_lock(&gt->lock);
if ( gt->gt_version == 0 )
goto out;
@@ -3287,7 +3314,7 @@ static void gnttab_usage_print(struct do
}
out:
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
if ( first )
printk("grant-table for remote domain:%5d ... "
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -64,6 +64,11 @@ struct grant_mapping {
/* Per-domain grant information. */
struct grant_table {
+ /*
+ * Lock protecting updates to grant table state (version, active
+ * entry list, etc.)
+ */
+ rwlock_t lock;
/* Table size. Number of frames shared with guest */
unsigned int nr_grant_frames;
/* Shared grant table (see include/public/grant_table.h). */
@@ -84,8 +89,6 @@ struct grant_table {
unsigned int maptrack_limit;
/* Lock protecting the maptrack page list, head, and limit */
spinlock_t maptrack_lock;
- /* Lock protecting updates to active and shared grant tables. */
- spinlock_t lock;
/* The defined versions are 1 and 2. Set to 0 if we don't know
what version to use yet. */
unsigned gt_version;
@@ -103,7 +106,7 @@ gnttab_release_mappings(
struct domain *d);
/* Increase the size of a domain's grant table.
- * Caller must hold d's grant table lock.
+ * Caller must hold d's grant table write lock.
*/
int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames);

View File

@@ -1,47 +0,0 @@
# Commit a622b5ade2bdf79ad95e6088a4041e75253c43f3
# Date 2015-06-16 12:30:16 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: factor out freeing an event channel
We're going to want to free an event channel from two places. Factor out
the code into a free_evtchn() function.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -194,6 +194,17 @@ static int get_free_port(struct domain *
return port;
}
+static void free_evtchn(struct domain *d, struct evtchn *chn)
+{
+ /* Clear pending event to avoid unexpected behavior on re-bind. */
+ evtchn_port_clear_pending(d, chn);
+
+ /* Reset binding to vcpu0 when the channel is freed. */
+ chn->state = ECS_FREE;
+ chn->notify_vcpu_id = 0;
+
+ xsm_evtchn_close_post(chn);
+}
static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
{
@@ -568,14 +579,7 @@ static long __evtchn_close(struct domain
BUG();
}
- /* Clear pending event to avoid unexpected behavior on re-bind. */
- evtchn_port_clear_pending(d1, chn1);
-
- /* Reset binding to vcpu0 when the channel is freed. */
- chn1->state = ECS_FREE;
- chn1->notify_vcpu_id = 0;
-
- xsm_evtchn_close_post(chn1);
+ free_evtchn(d1, chn1);
out:
if ( d2 != NULL )

View File

@@ -1,63 +0,0 @@
# Commit 01280dc19cf3da089f98faf4f524b54b5a191df0
# Date 2015-06-18 14:53:23 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: simplify port_is_valid()
By keeping a count of the number of currently valid event channels,
port_is_valid() can be simplified.
d->valid_evtchns is only increased (while holding d->event_lock), so
port_is_valid() may be safely called without taking the lock (this
will be useful later).
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
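A minimal sketch of the lock-free check this enables, using C11 atomics in place of Xen's read_atomic()/write_atomic(); the demo_* names are invented:

#include <stdatomic.h>
#include <stdbool.h>

struct demo_domain {
    atomic_uint valid_evtchns;  /* only ever grows */
    unsigned int max_evtchns;
};

/* Writer (holds the event lock): publish the count after the bucket is ready. */
static void demo_add_bucket(struct demo_domain *d, unsigned int per_bucket)
{
    /* ... allocate and fully initialise the new bucket first ... */
    atomic_fetch_add_explicit(&d->valid_evtchns, per_bucket,
                              memory_order_release);
}

/* Reader: no lock needed; a stale (smaller) count is always safe. */
static bool demo_port_is_valid(struct demo_domain *d, unsigned int p)
{
    if ( p >= d->max_evtchns )
        return false;
    return p < atomic_load_explicit(&d->valid_evtchns, memory_order_acquire);
}
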
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -191,6 +191,8 @@ static int get_free_port(struct domain *
return -ENOMEM;
bucket_from_port(d, port) = chn;
+ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
+
return port;
}
@@ -1264,6 +1266,7 @@ int evtchn_init(struct domain *d)
d->evtchn = alloc_evtchn_bucket(d, 0);
if ( !d->evtchn )
return -ENOMEM;
+ d->valid_evtchns = EVTCHNS_PER_BUCKET;
spin_lock_init(&d->event_lock);
if ( get_free_port(d) != 0 )
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -90,11 +90,7 @@ static inline bool_t port_is_valid(struc
{
if ( p >= d->max_evtchns )
return 0;
- if ( !d->evtchn )
- return 0;
- if ( p < EVTCHNS_PER_BUCKET )
- return 1;
- return group_from_port(d, p) != NULL && bucket_from_port(d, p) != NULL;
+ return p < read_atomic(&d->valid_evtchns);
}
static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -339,8 +339,9 @@ struct domain
/* Event channel information. */
struct evtchn *evtchn; /* first bucket only */
struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
- unsigned int max_evtchns;
- unsigned int max_evtchn_port;
+ unsigned int max_evtchns; /* number supported by ABI */
+ unsigned int max_evtchn_port; /* max permitted port number */
+ unsigned int valid_evtchns; /* number of allocated event channels */
spinlock_t event_lock;
const struct evtchn_port_ops *evtchn_port_ops;
struct evtchn_fifo_domain *evtchn_fifo;

View File

@@ -1,32 +0,0 @@
# Commit e156654d4eb2fdeb524e6b40838767a5dc918966
# Date 2015-06-18 14:54:25 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: remove the locking when unmasking an event channel
The event channel lock is no longer required to check if the port is
valid.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -931,8 +931,6 @@ int evtchn_unmask(unsigned int port)
struct domain *d = current->domain;
struct evtchn *evtchn;
- ASSERT(spin_is_locked(&d->event_lock));
-
if ( unlikely(!port_is_valid(d, port)) )
return -EINVAL;
@@ -1099,9 +1097,7 @@ long do_event_channel_op(int cmd, XEN_GU
struct evtchn_unmask unmask;
if ( copy_from_guest(&unmask, arg, 1) != 0 )
return -EFAULT;
- spin_lock(&current->domain->event_lock);
rc = evtchn_unmask(unmask.port);
- spin_unlock(&current->domain->event_lock);
break;
}

View File

@@ -1,287 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 236e13ce60e1c0eb0535ad258e74a3789bc0d074
# Date 2015-06-19 10:58:45 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI-X: cleanup
- __pci_enable_msix() now checks that an MSI-X capability was actually
found
- pass "pos" to msix_capability_init() as both callers already know it
(and hence there's no need to re-obtain it)
- call __pci_disable_msi{,x}() directly instead of via
pci_disable_msi() from __pci_enable_msi{x,}() state validation paths
- use msix_control_reg() instead of open coding it
- log message adjustments
- coding style corrections
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -35,6 +35,8 @@
static s8 __read_mostly use_msi = -1;
boolean_param("msi", use_msi);
+static void __pci_disable_msix(struct msi_desc *);
+
/* bitmap indicate which fixed map is free */
static DEFINE_SPINLOCK(msix_fixmap_lock);
static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
@@ -129,12 +131,14 @@ void msi_compose_msg(unsigned vector, co
unsigned dest;
memset(msg, 0, sizeof(*msg));
- if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) {
+ if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
+ {
dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__);
return;
}
- if ( vector ) {
+ if ( vector )
+ {
cpumask_t *mask = this_cpu(scratch_mask);
cpumask_and(mask, cpu_mask, &cpu_online_map);
@@ -195,8 +199,7 @@ static void read_msi_msg(struct msi_desc
}
case PCI_CAP_ID_MSIX:
{
- void __iomem *base;
- base = entry->mask_base;
+ void __iomem *base = entry->mask_base;
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -257,8 +260,7 @@ static int write_msi_msg(struct msi_desc
}
case PCI_CAP_ID_MSIX:
{
- void __iomem *base;
- base = entry->mask_base;
+ void __iomem *base = entry->mask_base;
writel(msg->address_lo,
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
@@ -281,7 +283,7 @@ void set_msi_affinity(struct irq_desc *d
struct msi_desc *msi_desc = desc->msi_desc;
dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID || !msi_desc)
+ if ( dest == BAD_APICID || !msi_desc )
return;
ASSERT(spin_is_locked(&desc->lock));
@@ -332,11 +334,11 @@ static void msix_set_enable(struct pci_d
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
if ( pos )
{
- control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS);
+ control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
control &= ~PCI_MSIX_FLAGS_ENABLE;
if ( enable )
control |= PCI_MSIX_FLAGS_ENABLE;
- pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
}
}
@@ -353,9 +355,11 @@ static void msi_set_mask_bit(struct irq_
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
+ switch ( entry->msi_attrib.type )
+ {
case PCI_CAP_ID_MSI:
- if (entry->msi_attrib.maskbit) {
+ if ( entry->msi_attrib.maskbit )
+ {
u32 mask_bits;
u16 seg = entry->dev->seg;
u8 bus = entry->dev->bus;
@@ -701,13 +705,14 @@ static u64 read_pci_mem_bar(u16 seg, u8
* requested MSI-X entries with allocated irqs or non-zero for otherwise.
**/
static int msix_capability_init(struct pci_dev *dev,
+ unsigned int pos,
struct msi_info *msi,
struct msi_desc **desc,
unsigned int nr_entries)
{
struct arch_msix *msix = dev->msix;
struct msi_desc *entry = NULL;
- int pos, vf;
+ int vf;
u16 control;
u64 table_paddr;
u32 table_offset;
@@ -719,7 +724,6 @@ static int msix_capability_init(struct p
ASSERT(spin_is_locked(&pcidevs_lock));
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
@@ -884,10 +888,9 @@ static int __pci_enable_msi(struct msi_i
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on "
- "device %04x:%02x:%02x.%01x\n",
- msi->irq, msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+ msi->irq, msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
*desc = old_desc;
return 0;
}
@@ -895,10 +898,10 @@ static int __pci_enable_msi(struct msi_i
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "MSI-X is already in use on "
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- pci_disable_msi(old_desc);
+ printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n",
+ msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ __pci_disable_msix(old_desc);
}
return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
@@ -912,7 +915,6 @@ static void __pci_disable_msi(struct msi
msi_set_enable(dev, 0);
BUG_ON(list_empty(&dev->msi_list));
-
}
/**
@@ -932,7 +934,7 @@ static void __pci_disable_msi(struct msi
**/
static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
{
- int status, pos, nr_entries;
+ int pos, nr_entries;
struct pci_dev *pdev;
u16 control;
u8 slot = PCI_SLOT(msi->devfn);
@@ -941,23 +943,22 @@ static int __pci_enable_msix(struct msi_
ASSERT(spin_is_locked(&pcidevs_lock));
pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
- if ( !pdev )
+ pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
+ if ( !pdev || !pos )
return -ENODEV;
- pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(msi->seg, msi->bus, slot, func,
msix_control_reg(pos));
nr_entries = multi_msix_capable(control);
- if (msi->entry_nr >= nr_entries)
+ if ( msi->entry_nr >= nr_entries )
return -EINVAL;
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on "
- "device %04x:%02x:%02x.%01x\n",
- msi->irq, msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+ msi->irq, msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
*desc = old_desc;
return 0;
}
@@ -965,15 +966,13 @@ static int __pci_enable_msix(struct msi_
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "MSI is already in use on "
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- pci_disable_msi(old_desc);
-
+ printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n",
+ msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ __pci_disable_msi(old_desc);
}
- status = msix_capability_init(pdev, msi, desc, nr_entries);
- return status;
+ return msix_capability_init(pdev, pos, msi, desc, nr_entries);
}
static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -991,19 +990,16 @@ static void _pci_cleanup_msix(struct arc
static void __pci_disable_msix(struct msi_desc *entry)
{
- struct pci_dev *dev;
- int pos;
- u16 control, seg;
- u8 bus, slot, func;
-
- dev = entry->dev;
- seg = dev->seg;
- bus = dev->bus;
- slot = PCI_SLOT(dev->devfn);
- func = PCI_FUNC(dev->devfn);
+ struct pci_dev *dev = entry->dev;
+ u16 seg = dev->seg;
+ u8 bus = dev->bus;
+ u8 slot = PCI_SLOT(dev->devfn);
+ u8 func = PCI_FUNC(dev->devfn);
+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+ u16 control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
- control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
msix_set_enable(dev, 0);
BUG_ON(list_empty(&dev->msi_list));
@@ -1045,7 +1041,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
u16 control = pci_conf_read16(seg, bus, slot, func,
msix_control_reg(pos));
- rc = msix_capability_init(pdev, NULL, NULL,
+ rc = msix_capability_init(pdev, pos, NULL, NULL,
multi_msix_capable(control));
}
spin_unlock(&pcidevs_lock);
@@ -1064,8 +1060,8 @@ int pci_enable_msi(struct msi_info *msi,
if ( !use_msi )
return -EPERM;
- return msi->table_base ? __pci_enable_msix(msi, desc) :
- __pci_enable_msi(msi, desc);
+ return msi->table_base ? __pci_enable_msix(msi, desc) :
+ __pci_enable_msi(msi, desc);
}
/*
@@ -1115,7 +1111,9 @@ int pci_restore_msi_state(struct pci_dev
if ( !pdev )
return -EINVAL;
- ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
+ ret = xsm_resource_setup_pci(XSM_PRIV,
+ (pdev->seg << 16) | (pdev->bus << 8) |
+ pdev->devfn);
if ( ret )
return ret;

View File

@@ -1,388 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit ad28e42bd1d28d746988ed71654e8aa670629753
# Date 2015-06-19 10:59:53 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI: track host and guest masking separately
In particular we want to avoid losing track of our own intention to
have an entry masked. Physical unmasking now happens only when both
host and guest requested so.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
# Commit 84d6add5593d865736831d150da7c38588f669f6
# Date 2015-07-10 12:36:24 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI: fix guest unmasking when handling IRQ via event channel
Rather than assuming only PV guests need special treatment (and
dealing with that directly when an IRQ gets set up), keep all guest MSI
IRQs masked until either the (HVM) guest unmasks them via vMSI or the
(PV, PVHVM, or PVH) guest sets up an event channel for it.
To not further clutter the common evtchn_bind_pirq() with x86-specific
code, introduce an arch_evtchn_bind_pirq() hook instead.
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
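A minimal sketch of how the two mask intentions combine, with invented demo_* types and a hypothetical write_hw_mask() callback standing in for the real control-register write:

#include <stdbool.h>

struct demo_msi_state {
    bool host_masked;
    bool guest_masked;
    void (*write_hw_mask)(bool masked);   /* programs the physical mask bit */
};

static void demo_set_mask(struct demo_msi_state *s, bool host, bool guest)
{
    s->host_masked = host;
    s->guest_masked = guest;
    /* Hardware is unmasked only when both host and guest agree. */
    s->write_hw_mask(host || guest);
}

/* Host mask/unmask preserves the guest's intention, and vice versa. */
static void demo_host_unmask(struct demo_msi_state *s)
{
    demo_set_mask(s, false, s->guest_masked);
}

static void demo_guest_unmask(struct demo_msi_state *s)
{
    demo_set_mask(s, s->host_masked, false);
}
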
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -240,7 +240,7 @@ static void hpet_msi_unmask(struct irq_d
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
cfg |= HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
- ch->msi.msi_attrib.masked = 0;
+ ch->msi.msi_attrib.host_masked = 0;
}
static void hpet_msi_mask(struct irq_desc *desc)
@@ -251,7 +251,7 @@ static void hpet_msi_mask(struct irq_des
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
cfg &= ~HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
- ch->msi.msi_attrib.masked = 1;
+ ch->msi.msi_attrib.host_masked = 1;
}
static int hpet_msi_write(struct hpet_event_channel *ch, struct msi_msg *msg)
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -219,7 +219,6 @@ static int msixtbl_read(
{
unsigned long offset;
struct msixtbl_entry *entry;
- void *virt;
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
@@ -253,13 +252,20 @@ static int msixtbl_read(
}
if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
- virt = msixtbl_addr_to_virt(entry, address);
+ const struct msi_desc *msi_desc;
+ void *virt = msixtbl_addr_to_virt(entry, address);
+
if ( !virt )
goto out;
+ msi_desc = virt_to_msi_desc(entry->pdev, virt);
+ if ( !msi_desc )
+ goto out;
if ( len == 4 )
- *pval = readl(virt);
+ *pval = MASK_INSR(msi_desc->msi_attrib.guest_masked,
+ PCI_MSIX_VECTOR_BITMASK);
else
- *pval |= (u64)readl(virt) << 32;
+ *pval |= (u64)MASK_INSR(msi_desc->msi_attrib.guest_masked,
+ PCI_MSIX_VECTOR_BITMASK) << 32;
}
r = X86EMUL_OKAY;
@@ -277,7 +283,7 @@ static int msixtbl_write(struct vcpu *v,
void *virt;
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- unsigned long flags, orig;
+ unsigned long flags;
struct irq_desc *desc;
if ( (len != 4 && len != 8) || (address & (len - 1)) )
@@ -337,37 +343,7 @@ static int msixtbl_write(struct vcpu *v,
ASSERT(msi_desc == desc->msi_desc);
- orig = readl(virt);
-
- /*
- * Do not allow guest to modify MSI-X control bit if it is masked
- * by Xen. We'll only handle the case where Xen thinks that
- * bit is unmasked, but hardware has silently masked the bit
- * (in case of SR-IOV VF reset, etc). On the other hand, if Xen
- * thinks that the bit is masked, but it's really not,
- * we log a warning.
- */
- if ( msi_desc->msi_attrib.masked )
- {
- if ( !(orig & PCI_MSIX_VECTOR_BITMASK) )
- printk(XENLOG_WARNING "MSI-X control bit is unmasked when"
- " it is expected to be masked [%04x:%02x:%02x.%u]\n",
- entry->pdev->seg, entry->pdev->bus,
- PCI_SLOT(entry->pdev->devfn),
- PCI_FUNC(entry->pdev->devfn));
-
- goto unlock;
- }
-
- /*
- * The mask bit is the only defined bit in the word. But we
- * ought to preserve the reserved bits. Clearing the reserved
- * bits can result in undefined behaviour (see PCI Local Bus
- * Specification revision 2.3).
- */
- val &= PCI_MSIX_VECTOR_BITMASK;
- val |= (orig & ~PCI_MSIX_VECTOR_BITMASK);
- writel(val, virt);
+ guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
unlock:
spin_unlock_irqrestore(&desc->lock, flags);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2502,6 +2502,25 @@ int unmap_domain_pirq_emuirq(struct doma
return ret;
}
+void arch_evtchn_bind_pirq(struct domain *d, int pirq)
+{
+ int irq = domain_pirq_to_irq(d, pirq);
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if ( irq <= 0 )
+ return;
+
+ if ( is_hvm_domain(d) )
+ map_domain_emuirq_pirq(d, pirq, IRQ_PT);
+
+ desc = irq_to_desc(irq);
+ spin_lock_irqsave(&desc->lock, flags);
+ if ( desc->msi_desc )
+ guest_mask_msi_irq(desc, 0);
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
bool_t hvm_domain_use_pirq(const struct domain *d, const struct pirq *pirq)
{
return is_hvm_domain(d) && pirq &&
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -349,9 +349,10 @@ int msi_maskable_irq(const struct msi_de
|| entry->msi_attrib.maskbit;
}
-static void msi_set_mask_bit(struct irq_desc *desc, int flag)
+static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
{
struct msi_desc *entry = desc->msi_desc;
+ bool_t flag = host || guest;
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
@@ -383,7 +384,8 @@ static void msi_set_mask_bit(struct irq_
BUG();
break;
}
- entry->msi_attrib.masked = !!flag;
+ entry->msi_attrib.host_masked = host;
+ entry->msi_attrib.guest_masked = guest;
}
static int msi_get_mask_bit(const struct msi_desc *entry)
@@ -405,20 +407,30 @@ static int msi_get_mask_bit(const struct
void mask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 1);
+ msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked);
}
void unmask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 0);
+ msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked);
+}
+
+void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask)
+{
+ msi_set_mask_bit(desc, desc->msi_desc->msi_attrib.host_masked, mask);
}
static unsigned int startup_msi_irq(struct irq_desc *desc)
{
- unmask_msi_irq(desc);
+ msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST));
return 0;
}
+static void shutdown_msi_irq(struct irq_desc *desc)
+{
+ msi_set_mask_bit(desc, 1, 1);
+}
+
void ack_nonmaskable_msi_irq(struct irq_desc *desc)
{
irq_complete_move(desc);
@@ -443,7 +455,7 @@ void end_nonmaskable_msi_irq(struct irq_
static hw_irq_controller pci_msi_maskable = {
.typename = "PCI-MSI/-X",
.startup = startup_msi_irq,
- .shutdown = mask_msi_irq,
+ .shutdown = shutdown_msi_irq,
.enable = unmask_msi_irq,
.disable = mask_msi_irq,
.ack = ack_maskable_msi_irq,
@@ -591,7 +603,8 @@ static int msi_capability_init(struct pc
entry[i].msi_attrib.is_64 = is_64bit_address(control);
entry[i].msi_attrib.entry_nr = i;
entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
- entry[i].msi_attrib.masked = 1;
+ entry[i].msi_attrib.host_masked = 1;
+ entry[i].msi_attrib.guest_masked = 0;
entry[i].msi_attrib.pos = pos;
if ( entry[i].msi_attrib.maskbit )
entry[i].msi.mpos = mpos;
@@ -817,7 +830,8 @@ static int msix_capability_init(struct p
entry->msi_attrib.is_64 = 1;
entry->msi_attrib.entry_nr = msi->entry_nr;
entry->msi_attrib.maskbit = 1;
- entry->msi_attrib.masked = 1;
+ entry->msi_attrib.host_masked = 1;
+ entry->msi_attrib.guest_masked = 1;
entry->msi_attrib.pos = pos;
entry->irq = msi->irq;
entry->dev = dev;
@@ -1152,7 +1166,8 @@ int pci_restore_msi_state(struct pci_dev
for ( i = 0; ; )
{
- msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
+ msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked,
+ entry[i].msi_attrib.guest_masked);
if ( !--nr )
break;
@@ -1304,7 +1319,7 @@ static void dump_msi(unsigned char key)
else
mask = '?';
printk(" %-6s%4u vec=%02x%7s%6s%3sassert%5s%7s"
- " dest=%08x mask=%d/%d/%c\n",
+ " dest=%08x mask=%d/%c%c/%c\n",
type, irq,
(data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT,
data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
@@ -1312,7 +1327,10 @@ static void dump_msi(unsigned char key)
data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
- dest32, attr.maskbit, attr.masked, mask);
+ dest32, attr.maskbit,
+ attr.host_masked ? 'H' : ' ',
+ attr.guest_masked ? 'G' : ' ',
+ mask);
}
}
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -445,10 +445,7 @@ static long evtchn_bind_pirq(evtchn_bind
bind->port = port;
-#ifdef CONFIG_X86
- if ( is_hvm_domain(d) && domain_pirq_to_irq(d, pirq) > 0 )
- map_domain_emuirq_pirq(d, pirq, IRQ_PT);
-#endif
+ arch_evtchn_bind_pirq(d, pirq);
out:
spin_unlock(&d->event_lock);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -451,7 +451,7 @@ static void iommu_msi_unmask(struct irq_
spin_lock_irqsave(&iommu->lock, flags);
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->msi.msi_attrib.masked = 0;
+ iommu->msi.msi_attrib.host_masked = 0;
}
static void iommu_msi_mask(struct irq_desc *desc)
@@ -464,7 +464,7 @@ static void iommu_msi_mask(struct irq_de
spin_lock_irqsave(&iommu->lock, flags);
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->msi.msi_attrib.masked = 1;
+ iommu->msi.msi_attrib.host_masked = 1;
}
static unsigned int iommu_msi_startup(struct irq_desc *desc)
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -996,7 +996,7 @@ static void dma_msi_unmask(struct irq_de
spin_lock_irqsave(&iommu->register_lock, flags);
dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
spin_unlock_irqrestore(&iommu->register_lock, flags);
- iommu->msi.msi_attrib.masked = 0;
+ iommu->msi.msi_attrib.host_masked = 0;
}
static void dma_msi_mask(struct irq_desc *desc)
@@ -1008,7 +1008,7 @@ static void dma_msi_mask(struct irq_desc
spin_lock_irqsave(&iommu->register_lock, flags);
dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
spin_unlock_irqrestore(&iommu->register_lock, flags);
- iommu->msi.msi_attrib.masked = 1;
+ iommu->msi.msi_attrib.host_masked = 1;
}
static unsigned int dma_msi_startup(struct irq_desc *desc)
--- a/xen/include/asm-arm/irq.h
+++ b/xen/include/asm-arm/irq.h
@@ -44,6 +44,8 @@ int route_irq_to_guest(struct domain *d,
const char *devname);
void arch_move_irqs(struct vcpu *v);
+#define arch_evtchn_bind_pirq(d, pirq) ((void)((d) + (pirq)))
+
/* Set IRQ type for an SPI */
int irq_set_spi_type(unsigned int spi, unsigned int type);
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -90,12 +90,13 @@ extern unsigned int pci_msix_get_table_l
struct msi_desc {
struct msi_attrib {
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
- __u8 maskbit : 1; /* mask-pending bit supported ? */
- __u8 masked : 1;
+ __u8 type; /* {0: unused, 5h:MSI, 11h:MSI-X} */
+ __u8 pos; /* Location of the MSI capability */
+ __u8 maskbit : 1; /* mask/pending bit supported ? */
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
- __u8 pos; /* Location of the msi capability */
- __u16 entry_nr; /* specific enabled entry */
+ __u8 host_masked : 1;
+ __u8 guest_masked : 1;
+ __u16 entry_nr; /* specific enabled entry */
} msi_attrib;
struct list_head list;
@@ -236,6 +237,7 @@ void msi_compose_msg(unsigned vector, co
void __msi_set_enable(u16 seg, u8 bus, u8 slot, u8 func, int pos, int enable);
void mask_msi_irq(struct irq_desc *);
void unmask_msi_irq(struct irq_desc *);
+void guest_mask_msi_irq(struct irq_desc *, bool_t mask);
void ack_nonmaskable_msi_irq(struct irq_desc *);
void end_nonmaskable_msi_irq(struct irq_desc *, u8 vector);
void set_msi_affinity(struct irq_desc *, const cpumask_t *);
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -172,4 +172,8 @@ unsigned int set_desc_affinity(struct ir
unsigned int arch_hwdom_irqs(domid_t);
#endif
+#ifndef arch_evtchn_bind_pirq
+void arch_evtchn_bind_pirq(struct domain *, int pirq);
+#endif
+
#endif /* __XEN_IRQ_H__ */

View File

@@ -1,284 +0,0 @@
# Commit dff515dfeac4c1c13422a128c558ac21ddc6c8db
# Date 2015-06-19 11:01:24 +0200
# Author Malcolm Crossley <malcolm.crossley@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: use per-VCPU maptrack free lists
Performance analysis of aggregate network throughput with many VMs
shows that performance is significantly limited by contention on the
maptrack lock when obtaining/releasing maptrack handles from the free
list.
Instead of a single free list use a per-VCPU list. This avoids any
contention when obtaining a handle. Handles must be released back to
their original list and since this may occur on a different VCPU there
is some contention on the destination VCPU's free list tail pointer
(but this is much better than a per-domain lock).
Increase the default maximum number of maptrack frames by 4 times
because: a) struct grant_mapping is now 16 bytes (instead of 8); and
b) a guest may not evenly distribute all the grant map operations
across the VCPUs (meaning some VCPUs need more maptrack entries than
others).
Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
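A minimal sketch of the per-VCPU free list idea, using C11 atomics in place of Xen's cmpxchg()/read_atomic()/write_atomic(); the demo_* names are invented and the real maptrack layout differs:

#include <stdatomic.h>

#define DEMO_TAIL (~0u)

struct demo_vcpu {
    unsigned int head;   /* popped only by the owning VCPU */
    atomic_uint tail;
};

struct demo_entry {
    atomic_uint next;
    unsigned int owner;  /* VCPU whose list this entry belongs to */
};

/* Allocate: pop from this VCPU's own list; no lock and no contention. */
static unsigned int demo_get(struct demo_vcpu *v, struct demo_entry *e)
{
    unsigned int h = v->head, next;

    if ( h == DEMO_TAIL )
        return DEMO_TAIL;               /* no entries for this VCPU yet */
    next = atomic_load(&e[h].next);
    if ( next == DEMO_TAIL )
        return DEMO_TAIL;               /* keep one entry so frees have a tail */
    v->head = next;
    return h;
}

/* Free: append the entry to the tail of its owner's list. */
static void demo_put(struct demo_vcpu *vcpus, struct demo_entry *e,
                     unsigned int h)
{
    struct demo_vcpu *owner = &vcpus[e[h].owner];
    unsigned int prev;

    atomic_store(&e[h].next, DEMO_TAIL);      /* new entry becomes the tail */
    prev = atomic_exchange(&owner->tail, h);  /* publish the new tail */
    atomic_store(&e[prev].next, h);           /* link the old tail to it */
}
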
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -126,6 +126,8 @@ struct vcpu *alloc_vcpu(
tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
+ grant_table_init_vcpu(v);
+
if ( !zalloc_cpumask_var(&v->cpu_hard_affinity) ||
!zalloc_cpumask_var(&v->cpu_hard_affinity_tmp) ||
!zalloc_cpumask_var(&v->cpu_hard_affinity_saved) ||
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -37,6 +37,7 @@
#include <xen/iommu.h>
#include <xen/paging.h>
#include <xen/keyhandler.h>
+#include <xen/vmap.h>
#include <xsm/xsm.h>
#include <asm/flushtlb.h>
@@ -57,7 +58,7 @@ integer_param("gnttab_max_frames", max_g
* New options allow to set max_maptrack_frames and
* map_grant_table_frames independently.
*/
-#define DEFAULT_MAX_MAPTRACK_FRAMES 256
+#define DEFAULT_MAX_MAPTRACK_FRAMES 1024
static unsigned int __read_mostly max_maptrack_frames;
integer_param("gnttab_max_maptrack_frames", max_maptrack_frames);
@@ -279,62 +280,103 @@ double_gt_unlock(struct grant_table *lgt
static inline int
__get_maptrack_handle(
- struct grant_table *t)
+ struct grant_table *t,
+ struct vcpu *v)
{
- unsigned int h;
- if ( unlikely((h = t->maptrack_head) == MAPTRACK_TAIL) )
+ unsigned int head, next;
+
+ /* No maptrack pages allocated for this VCPU yet? */
+ head = v->maptrack_head;
+ if ( unlikely(head == MAPTRACK_TAIL) )
return -1;
- t->maptrack_head = maptrack_entry(t, h).ref;
- return h;
+
+ /*
+ * Always keep one entry in the free list to make it easier to add
+ * free entries to the tail.
+ */
+ next = read_atomic(&maptrack_entry(t, head).ref);
+ if ( unlikely(next == MAPTRACK_TAIL) )
+ return -1;
+
+ v->maptrack_head = next;
+
+ return head;
}
static inline void
put_maptrack_handle(
struct grant_table *t, int handle)
{
- spin_lock(&t->maptrack_lock);
- maptrack_entry(t, handle).ref = t->maptrack_head;
- t->maptrack_head = handle;
- spin_unlock(&t->maptrack_lock);
+ struct domain *currd = current->domain;
+ struct vcpu *v;
+ unsigned int prev_tail, cur_tail;
+
+ /* 1. Set entry to be a tail. */
+ maptrack_entry(t, handle).ref = MAPTRACK_TAIL;
+
+ /* 2. Add entry to the tail of the list on the original VCPU. */
+ v = currd->vcpu[maptrack_entry(t, handle).vcpu];
+
+ cur_tail = read_atomic(&v->maptrack_tail);
+ do {
+ prev_tail = cur_tail;
+ cur_tail = cmpxchg(&v->maptrack_tail, prev_tail, handle);
+ } while ( cur_tail != prev_tail );
+
+ /* 3. Update the old tail entry to point to the new entry. */
+ write_atomic(&maptrack_entry(t, prev_tail).ref, handle);
}
static inline int
get_maptrack_handle(
struct grant_table *lgt)
{
+ struct vcpu *curr = current;
int i;
grant_handle_t handle;
struct grant_mapping *new_mt;
- unsigned int new_mt_limit, nr_frames;
+
+ handle = __get_maptrack_handle(lgt, curr);
+ if ( likely(handle != -1) )
+ return handle;
spin_lock(&lgt->maptrack_lock);
- while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
+ if ( nr_maptrack_frames(lgt) >= max_maptrack_frames )
{
- nr_frames = nr_maptrack_frames(lgt);
- if ( nr_frames >= max_maptrack_frames )
- break;
+ spin_unlock(&lgt->maptrack_lock);
+ return -1;
+ }
- new_mt = alloc_xenheap_page();
- if ( !new_mt )
- break;
+ new_mt = alloc_xenheap_page();
+ if ( !new_mt )
+ {
+ spin_unlock(&lgt->maptrack_lock);
+ return -1;
+ }
+ clear_page(new_mt);
- clear_page(new_mt);
+ /*
+ * Use the first new entry and add the remaining entries to the
+ * head of the free list.
+ */
+ handle = lgt->maptrack_limit;
- new_mt_limit = lgt->maptrack_limit + MAPTRACK_PER_PAGE;
+ for ( i = 0; i < MAPTRACK_PER_PAGE; i++ )
+ {
+ new_mt[i].ref = handle + i + 1;
+ new_mt[i].vcpu = curr->vcpu_id;
+ }
+ new_mt[i - 1].ref = curr->maptrack_head;
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
- new_mt[i - 1].ref = lgt->maptrack_limit + i;
- new_mt[i - 1].ref = lgt->maptrack_head;
- lgt->maptrack_head = lgt->maptrack_limit;
+ /* Set tail directly if this is the first page for this VCPU. */
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
+ curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
- lgt->maptrack[nr_frames] = new_mt;
- smp_wmb();
- lgt->maptrack_limit = new_mt_limit;
+ curr->maptrack_head = handle + 1;
- gdprintk(XENLOG_INFO, "Increased maptrack size to %u frames\n",
- nr_frames + 1);
- }
+ lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
+ lgt->maptrack_limit += MAPTRACK_PER_PAGE;
spin_unlock(&lgt->maptrack_lock);
@@ -3061,16 +3103,9 @@ grant_table_create(
}
/* Tracking of mapped foreign frames table */
- if ( (t->maptrack = xzalloc_array(struct grant_mapping *,
- max_maptrack_frames)) == NULL )
+ t->maptrack = vzalloc(max_maptrack_frames * sizeof(*t->maptrack));
+ if ( t->maptrack == NULL )
goto no_mem_2;
- if ( (t->maptrack[0] = alloc_xenheap_page()) == NULL )
- goto no_mem_3;
- clear_page(t->maptrack[0]);
- t->maptrack_limit = MAPTRACK_PER_PAGE;
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
- t->maptrack[0][i - 1].ref = i;
- t->maptrack[0][i - 1].ref = MAPTRACK_TAIL;
/* Shared grant table. */
if ( (t->shared_raw = xzalloc_array(void *, max_grant_frames)) == NULL )
@@ -3102,8 +3137,7 @@ grant_table_create(
free_xenheap_page(t->shared_raw[i]);
xfree(t->shared_raw);
no_mem_3:
- free_xenheap_page(t->maptrack[0]);
- xfree(t->maptrack);
+ vfree(t->maptrack);
no_mem_2:
for ( i = 0;
i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ )
@@ -3238,7 +3272,7 @@ grant_table_destroy(
for ( i = 0; i < nr_maptrack_frames(t); i++ )
free_xenheap_page(t->maptrack[i]);
- xfree(t->maptrack);
+ vfree(t->maptrack);
for ( i = 0; i < nr_active_grant_frames(t); i++ )
free_xenheap_page(t->active[i]);
@@ -3252,6 +3286,12 @@ grant_table_destroy(
d->grant_table = NULL;
}
+void grant_table_init_vcpu(struct vcpu *v)
+{
+ v->maptrack_head = MAPTRACK_TAIL;
+ v->maptrack_tail = MAPTRACK_TAIL;
+}
+
static void gnttab_usage_print(struct domain *rd)
{
int first = 1;
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -60,6 +60,8 @@ struct grant_mapping {
u32 ref; /* grant ref */
u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */
domid_t domid; /* granting domain */
+ u32 vcpu; /* vcpu which created the grant mapping */
+ u32 pad; /* round size to a power of 2 */
};
/* Per-domain grant information. */
@@ -83,9 +85,8 @@ struct grant_table {
grant_status_t **status;
/* Active grant table. */
struct active_grant_entry **active;
- /* Mapping tracking table. */
+ /* Mapping tracking table per vcpu. */
struct grant_mapping **maptrack;
- unsigned int maptrack_head;
unsigned int maptrack_limit;
/* Lock protecting the maptrack page list, head, and limit */
spinlock_t maptrack_lock;
@@ -99,6 +100,7 @@ int grant_table_create(
struct domain *d);
void grant_table_destroy(
struct domain *d);
+void grant_table_init_vcpu(struct vcpu *v);
/* Domain death release of granted mappings of other domains' memory. */
void
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -219,6 +219,10 @@ struct vcpu
/* VCPU paused by system controller. */
int controller_pause_count;
+ /* Maptrack */
+ unsigned int maptrack_head;
+ unsigned int maptrack_tail;
+
/* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
evtchn_port_t virq_to_evtchn[NR_VIRQS];
spinlock_t virq_lock;

View File

@ -1,153 +0,0 @@
# Commit e76ff6c156906b515c2a4300a81c95886ece5d5f
# Date 2015-06-19 11:02:04 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: steal maptrack entries from other VCPUs
If a guest is not evenly grant mapping across its VCPUs, one of the
VCPUs may run out of free maptrack entries even though other VCPUs
have many free.
If this happens, "steal" free entries from other VCPUs. We want to
steal entries such that:
a) We avoid ping-ponging stolen entries between VCPUs.
b) The number of free entries owned by each VCPU tends (over time) to
the number it uses.
So when stealing, we select a VCPU at random (reducing (a)) and we
transfer the stolen entries to the thief VCPU (aiming for (b)).
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
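As a side note on the list manipulation this relies on (an editor's sketch, not part of the patch): the per-VCPU free list is popped with a compare-and-swap retry loop of the same shape as the cmpxchg() loop in __get_maptrack_handle() below. Written against standard C11 atomics, with a hypothetical next[] array holding each entry's successor, it looks roughly like this; the real code additionally keeps one sentinel entry on every list so entries can also be appended at the tail, a subtlety omitted here.
#include <stdatomic.h>
#define TAIL (~0u)
/* Pop the head of a lock-free LIFO whose links live in next[]. */
static unsigned int pop_free_head(_Atomic unsigned int *head,
                                  const unsigned int *next)
{
    unsigned int old = atomic_load(head);
    do {
        if ( old == TAIL )      /* list empty: caller must steal or grow */
            return TAIL;
    } while ( !atomic_compare_exchange_weak(head, &old, next[old]) );
    return old;                 /* caller now owns entry 'old' */
}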
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -283,26 +283,70 @@ __get_maptrack_handle(
struct grant_table *t,
struct vcpu *v)
{
- unsigned int head, next;
+ unsigned int head, next, prev_head;
- /* No maptrack pages allocated for this VCPU yet? */
- head = v->maptrack_head;
- if ( unlikely(head == MAPTRACK_TAIL) )
- return -1;
-
- /*
- * Always keep one entry in the free list to make it easier to add
- * free entries to the tail.
- */
- next = read_atomic(&maptrack_entry(t, head).ref);
- if ( unlikely(next == MAPTRACK_TAIL) )
- return -1;
+ do {
+ /* No maptrack pages allocated for this VCPU yet? */
+ head = read_atomic(&v->maptrack_head);
+ if ( unlikely(head == MAPTRACK_TAIL) )
+ return -1;
- v->maptrack_head = next;
+ /*
+ * Always keep one entry in the free list to make it easier to
+ * add free entries to the tail.
+ */
+ next = read_atomic(&maptrack_entry(t, head).ref);
+ if ( unlikely(next == MAPTRACK_TAIL) )
+ return -1;
+
+ prev_head = head;
+ head = cmpxchg(&v->maptrack_head, prev_head, next);
+ } while ( head != prev_head );
return head;
}
+/*
+ * Try to "steal" a free maptrack entry from another VCPU.
+ *
+ * A stolen entry is transferred to the thief, so the number of
+ * entries for each VCPU should tend to the usage pattern.
+ *
+ * To avoid having to atomically count the number of free entries on
+ * each VCPU and to avoid two VCPU repeatedly stealing entries from
+ * each other, the initial victim VCPU is selected randomly.
+ */
+static int steal_maptrack_handle(struct grant_table *t,
+ const struct vcpu *curr)
+{
+ const struct domain *currd = curr->domain;
+ unsigned int first, i;
+
+ /* Find an initial victim. */
+ first = i = get_random() % currd->max_vcpus;
+
+ do {
+ if ( currd->vcpu[i] )
+ {
+ int handle;
+
+ handle = __get_maptrack_handle(t, currd->vcpu[i]);
+ if ( handle != -1 )
+ {
+ maptrack_entry(t, handle).vcpu = curr->vcpu_id;
+ return handle;
+ }
+ }
+
+ i++;
+ if ( i == currd->max_vcpus )
+ i = 0;
+ } while ( i != first );
+
+ /* No free handles on any VCPU. */
+ return -1;
+}
+
static inline void
put_maptrack_handle(
struct grant_table *t, int handle)
@@ -342,10 +386,31 @@ get_maptrack_handle(
spin_lock(&lgt->maptrack_lock);
+ /*
+ * If we've run out of frames, try stealing an entry from another
+ * VCPU (in case the guest isn't mapping across its VCPUs evenly).
+ */
if ( nr_maptrack_frames(lgt) >= max_maptrack_frames )
{
+ /*
+ * Can drop the lock since no other VCPU can be adding a new
+ * frame once they've run out.
+ */
spin_unlock(&lgt->maptrack_lock);
- return -1;
+
+ /*
+ * Uninitialized free list? Steal an extra entry for the tail
+ * sentinel.
+ */
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
+ {
+ handle = steal_maptrack_handle(lgt, curr);
+ if ( handle == -1 )
+ return -1;
+ curr->maptrack_tail = handle;
+ write_atomic(&curr->maptrack_head, handle);
+ }
+ return steal_maptrack_handle(lgt, curr);
}
new_mt = alloc_xenheap_page();
@@ -373,7 +438,7 @@ get_maptrack_handle(
if ( curr->maptrack_tail == MAPTRACK_TAIL )
curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
- curr->maptrack_head = handle + 1;
+ write_atomic(&curr->maptrack_head, handle + 1);
lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
lgt->maptrack_limit += MAPTRACK_PER_PAGE;

View File

@ -1,105 +0,0 @@
# Commit b399386bcdb9d458f5647476a06fe86f5968d87e
# Date 2015-06-22 11:36:17 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: clear xen_consumer when clearing state
Freeing a xen event channel would clear xen_consumer before clearing
the channel state, leaving a window where the channel is in a funny
state (still bound but no consumer).
Move the clear of xen_consumer into free_evtchn() where the state is
also cleared.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Ditch the pointless evtchn_close() wrapper around __evtchn_close()
(renaming the latter) as well as some bogus casts of function results
to void.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -204,6 +204,7 @@ static void free_evtchn(struct domain *d
/* Reset binding to vcpu0 when the channel is freed. */
chn->state = ECS_FREE;
chn->notify_vcpu_id = 0;
+ chn->xen_consumer = 0;
xsm_evtchn_close_post(chn);
}
@@ -467,7 +468,7 @@ static long evtchn_bind_pirq(evtchn_bind
}
-static long __evtchn_close(struct domain *d1, int port1)
+static long evtchn_close(struct domain *d1, int port1, bool_t guest)
{
struct domain *d2 = NULL;
struct vcpu *v;
@@ -487,7 +488,7 @@ static long __evtchn_close(struct domain
chn1 = evtchn_from_port(d1, port1);
/* Guest cannot close a Xen-attached event channel. */
- if ( unlikely(consumer_is_xen(chn1)) )
+ if ( unlikely(consumer_is_xen(chn1)) && guest )
{
rc = -EINVAL;
goto out;
@@ -596,12 +597,6 @@ static long __evtchn_close(struct domain
return rc;
}
-
-static long evtchn_close(evtchn_close_t *close)
-{
- return __evtchn_close(current->domain, close->port);
-}
-
int evtchn_send(struct domain *d, unsigned int lport)
{
struct evtchn *lchn, *rchn;
@@ -956,7 +951,7 @@ static long evtchn_reset(evtchn_reset_t
goto out;
for ( i = 0; port_is_valid(d, i); i++ )
- (void)__evtchn_close(d, i);
+ evtchn_close(d, i, 1);
spin_lock(&d->event_lock);
@@ -1063,7 +1058,7 @@ long do_event_channel_op(int cmd, XEN_GU
struct evtchn_close close;
if ( copy_from_guest(&close, arg, 1) != 0 )
return -EFAULT;
- rc = evtchn_close(&close);
+ rc = evtchn_close(current->domain, close.port, 1);
break;
}
@@ -1193,11 +1188,10 @@ void free_xen_event_channel(
BUG_ON(!port_is_valid(d, port));
chn = evtchn_from_port(d, port);
BUG_ON(!consumer_is_xen(chn));
- chn->xen_consumer = 0;
spin_unlock(&d->event_lock);
- (void)__evtchn_close(d, port);
+ evtchn_close(d, port, 0);
}
@@ -1296,10 +1290,7 @@ void evtchn_destroy(struct domain *d)
/* Close all existing event channels. */
for ( i = 0; port_is_valid(d, i); i++ )
- {
- evtchn_from_port(d, i)->xen_consumer = 0;
- (void)__evtchn_close(d, i);
- }
+ evtchn_close(d, i, 0);
/* Free all event-channel buckets. */
spin_lock(&d->event_lock);

View File

@ -1,110 +0,0 @@
# Commit a753f0e53ff973a8a066e86c1cb3d6dd5c68d59f
# Date 2015-06-22 11:38:01 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: defer freeing struct evtchn's until evtchn_destroy_final()
notify_via_xen_event_channel() and free_xen_event_channel() had to
check if the domain was dying because they may be called while the
domain is being destroyed and the struct evtchn's are being freed.
By deferring the freeing of the struct evtchn's until all references
to the domain are dropped, these functions can rely on the channel
state being present and valid.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -1174,22 +1174,8 @@ int alloc_unbound_xen_event_channel(
void free_xen_event_channel(
struct vcpu *local_vcpu, int port)
{
- struct evtchn *chn;
struct domain *d = local_vcpu->domain;
-
- spin_lock(&d->event_lock);
-
- if ( unlikely(d->is_dying) )
- {
- spin_unlock(&d->event_lock);
- return;
- }
-
BUG_ON(!port_is_valid(d, port));
- chn = evtchn_from_port(d, port);
- BUG_ON(!consumer_is_xen(chn));
-
- spin_unlock(&d->event_lock);
evtchn_close(d, port, 0);
}
@@ -1203,18 +1189,12 @@ void notify_via_xen_event_channel(struct
spin_lock(&ld->event_lock);
- if ( unlikely(ld->is_dying) )
- {
- spin_unlock(&ld->event_lock);
- return;
- }
-
ASSERT(port_is_valid(ld, lport));
lchn = evtchn_from_port(ld, lport);
- ASSERT(consumer_is_xen(lchn));
if ( likely(lchn->state == ECS_INTERDOMAIN) )
{
+ ASSERT(consumer_is_xen(lchn));
rd = lchn->u.interdomain.remote_dom;
rport = lchn->u.interdomain.remote_port;
rchn = evtchn_from_port(rd, rport);
@@ -1282,7 +1262,7 @@ int evtchn_init(struct domain *d)
void evtchn_destroy(struct domain *d)
{
- unsigned int i, j;
+ unsigned int i;
/* After this barrier no new event-channel allocations can occur. */
BUG_ON(!d->is_dying);
@@ -1292,8 +1272,17 @@ void evtchn_destroy(struct domain *d)
for ( i = 0; port_is_valid(d, i); i++ )
evtchn_close(d, i, 0);
+ clear_global_virq_handlers(d);
+
+ evtchn_fifo_destroy(d);
+}
+
+
+void evtchn_destroy_final(struct domain *d)
+{
+ unsigned int i, j;
+
/* Free all event-channel buckets. */
- spin_lock(&d->event_lock);
for ( i = 0; i < NR_EVTCHN_GROUPS; i++ )
{
if ( !d->evtchn_group[i] )
@@ -1301,20 +1290,9 @@ void evtchn_destroy(struct domain *d)
for ( j = 0; j < BUCKETS_PER_GROUP; j++ )
free_evtchn_bucket(d, d->evtchn_group[i][j]);
xfree(d->evtchn_group[i]);
- d->evtchn_group[i] = NULL;
}
free_evtchn_bucket(d, d->evtchn);
- d->evtchn = NULL;
- spin_unlock(&d->event_lock);
- clear_global_virq_handlers(d);
-
- evtchn_fifo_destroy(d);
-}
-
-
-void evtchn_destroy_final(struct domain *d)
-{
#if MAX_VIRT_CPUS > BITS_PER_LONG
xfree(d->poll_mask);
d->poll_mask = NULL;

View File

@ -1,257 +0,0 @@
# Commit de6acb78bf0e137cbe5b72cee4a35ca018d759cc
# Date 2015-06-22 11:39:03 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: use a per-event channel lock for sending events
When sending an event, use a new per-event channel lock to safely
validate the event channel state.
This new lock must be held when changing event channel state. Note
that the event channel lock must also be held when changing state from
ECS_FREE or it will race with a concurrent get_free_port() call.
To avoid having to take the remote event channel locks when sending to
an interdomain event channel, the local and remote channel locks are
both held when binding or closing an interdomain event channel.
This significantly increases the number of events that can be sent
from multiple VCPUs. But struct evtchn increases in size, reducing
the number that fit into a single page to 64 (instead of 128).
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
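The deadlock-avoidance trick used by double_evtchn_lock() in the hunk below is worth spelling out: when two channels must be locked together, they are always taken in a fixed global order (lowest address first), so two CPUs locking the same pair from opposite directions cannot each hold one lock while waiting for the other. A minimal sketch of the same idiom, phrased with pthread mutexes rather than Xen's spinlocks (names hypothetical):
#include <pthread.h>
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a == b )
    {
        pthread_mutex_lock(a);      /* same object: take it once */
        return;
    }
    if ( a > b )
    {
        pthread_mutex_t *t = a;     /* order by address */
        a = b;
        b = t;
    }
    pthread_mutex_lock(a);
    pthread_mutex_lock(b);
}
static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    pthread_mutex_unlock(a);
    if ( a != b )
        pthread_mutex_unlock(b);
}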
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -141,6 +141,7 @@ static struct evtchn *alloc_evtchn_bucke
return NULL;
}
chn[i].port = port + i;
+ spin_lock_init(&chn[i].lock);
}
return chn;
}
@@ -231,11 +232,15 @@ static long evtchn_alloc_unbound(evtchn_
if ( rc )
goto out;
+ spin_lock(&chn->lock);
+
chn->state = ECS_UNBOUND;
if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF )
chn->u.unbound.remote_domid = current->domain->domain_id;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
alloc->port = port;
out:
@@ -246,6 +251,28 @@ static long evtchn_alloc_unbound(evtchn_
}
+static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
+{
+ if ( lchn < rchn )
+ {
+ spin_lock(&lchn->lock);
+ spin_lock(&rchn->lock);
+ }
+ else
+ {
+ if ( lchn != rchn )
+ spin_lock(&rchn->lock);
+ spin_lock(&lchn->lock);
+ }
+}
+
+static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn)
+{
+ spin_unlock(&lchn->lock);
+ if ( lchn != rchn )
+ spin_unlock(&rchn->lock);
+}
+
static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
{
struct evtchn *lchn, *rchn;
@@ -288,6 +315,8 @@ static long evtchn_bind_interdomain(evtc
if ( rc )
goto out;
+ double_evtchn_lock(lchn, rchn);
+
lchn->u.interdomain.remote_dom = rd;
lchn->u.interdomain.remote_port = rport;
lchn->state = ECS_INTERDOMAIN;
@@ -303,6 +332,8 @@ static long evtchn_bind_interdomain(evtc
*/
evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport);
+ double_evtchn_unlock(lchn, rchn);
+
bind->local_port = lport;
out:
@@ -343,11 +374,16 @@ static long evtchn_bind_virq(evtchn_bind
ERROR_EXIT(port);
chn = evtchn_from_port(d, port);
+
+ spin_lock(&chn->lock);
+
chn->state = ECS_VIRQ;
chn->notify_vcpu_id = vcpu;
chn->u.virq = virq;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
v->virq_to_evtchn[virq] = bind->port = port;
out:
@@ -374,10 +410,15 @@ static long evtchn_bind_ipi(evtchn_bind_
ERROR_EXIT(port);
chn = evtchn_from_port(d, port);
+
+ spin_lock(&chn->lock);
+
chn->state = ECS_IPI;
chn->notify_vcpu_id = vcpu;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
bind->port = port;
out:
@@ -452,11 +493,15 @@ static long evtchn_bind_pirq(evtchn_bind
goto out;
}
+ spin_lock(&chn->lock);
+
chn->state = ECS_PIRQ;
chn->u.pirq.irq = pirq;
link_pirq_port(port, chn, v);
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
bind->port = port;
arch_evtchn_bind_pirq(d, pirq);
@@ -574,15 +619,24 @@ static long evtchn_close(struct domain *
BUG_ON(chn2->state != ECS_INTERDOMAIN);
BUG_ON(chn2->u.interdomain.remote_dom != d1);
+ double_evtchn_lock(chn1, chn2);
+
+ free_evtchn(d1, chn1);
+
chn2->state = ECS_UNBOUND;
chn2->u.unbound.remote_domid = d1->domain_id;
- break;
+
+ double_evtchn_unlock(chn1, chn2);
+
+ goto out;
default:
BUG();
}
+ spin_lock(&chn1->lock);
free_evtchn(d1, chn1);
+ spin_unlock(&chn1->lock);
out:
if ( d2 != NULL )
@@ -604,21 +658,18 @@ int evtchn_send(struct domain *d, unsign
struct vcpu *rvcpu;
int rport, ret = 0;
- spin_lock(&ld->event_lock);
-
- if ( unlikely(!port_is_valid(ld, lport)) )
- {
- spin_unlock(&ld->event_lock);
+ if ( !port_is_valid(ld, lport) )
return -EINVAL;
- }
lchn = evtchn_from_port(ld, lport);
+ spin_lock(&lchn->lock);
+
/* Guest cannot send via a Xen-attached event channel. */
if ( unlikely(consumer_is_xen(lchn)) )
{
- spin_unlock(&ld->event_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
ret = xsm_evtchn_send(XSM_HOOK, ld, lchn);
@@ -648,7 +699,7 @@ int evtchn_send(struct domain *d, unsign
}
out:
- spin_unlock(&ld->event_lock);
+ spin_unlock(&lchn->lock);
return ret;
}
@@ -1159,11 +1210,15 @@ int alloc_unbound_xen_event_channel(
if ( rc )
goto out;
+ spin_lock(&chn->lock);
+
chn->state = ECS_UNBOUND;
chn->xen_consumer = get_xen_consumer(notification_fn);
chn->notify_vcpu_id = local_vcpu->vcpu_id;
chn->u.unbound.remote_domid = remote_domid;
+ spin_unlock(&chn->lock);
+
out:
spin_unlock(&d->event_lock);
@@ -1187,11 +1242,11 @@ void notify_via_xen_event_channel(struct
struct domain *rd;
int rport;
- spin_lock(&ld->event_lock);
-
ASSERT(port_is_valid(ld, lport));
lchn = evtchn_from_port(ld, lport);
+ spin_lock(&lchn->lock);
+
if ( likely(lchn->state == ECS_INTERDOMAIN) )
{
ASSERT(consumer_is_xen(lchn));
@@ -1201,7 +1256,7 @@ void notify_via_xen_event_channel(struct
evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport);
}
- spin_unlock(&ld->event_lock);
+ spin_unlock(&lchn->lock);
}
void evtchn_check_pollers(struct domain *d, unsigned int port)
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -79,6 +79,7 @@ extern domid_t hardware_domid;
struct evtchn
{
+ spinlock_t lock;
#define ECS_FREE 0 /* Channel is available for use. */
#define ECS_RESERVED 1 /* Channel is reserved. */
#define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */

View File

@ -1,27 +0,0 @@
# Commit b58214a24231a1f2a7e09ae9cc3014eff752918b
# Date 2015-06-22 11:39:46 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
evtchn: pad struct evtchn to 64 bytes
The number of struct evtchn in a page must be a power of two. Under
some workloads performance is improved slightly by padding struct
evtchn to 64 bytes (a typical cache line size), thus putting fewer
per-channel locks into each cache line.
This does not decrease the number of struct evtchn's per page.
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
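The arithmetic behind the two claims is quick to check (an editor's sketch assuming 4 KiB pages; EVTCHN_SIZE stands in for sizeof(struct evtchn)):
#include <assert.h>
#define PAGE_SIZE   4096u
#define EVTCHN_SIZE 64u     /* padded size, one typical cache line */
int main(void)
{
    unsigned int per_page = PAGE_SIZE / EVTCHN_SIZE;
    assert(per_page == 64);                     /* unchanged vs. the previous patch */
    assert((per_page & (per_page - 1)) == 0);   /* still a power of two */
    return 0;
}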
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -129,7 +129,7 @@ struct evtchn
#endif
} ssid;
#endif
-};
+} __attribute__((aligned(64)));
int evtchn_init(struct domain *d); /* from domain_create */
void evtchn_destroy(struct domain *d); /* from domain_kill */

View File

@ -1,23 +0,0 @@
# Commit 142473cfce41a565898e0fa33dc98a1f5e41abe4
# Date 2015-06-25 14:57:04 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/traps: avoid using current too early on boot
Early on boot, current has the sentinel value 0xfffff000. Blindly using it in
show_registers() causes a nested failure, and no useful information gets printed
from an early crash.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -86,7 +86,7 @@ void show_registers(const struct cpu_use
struct cpu_user_regs fault_regs = *regs;
unsigned long fault_crs[8];
enum context context;
- struct vcpu *v = current;
+ struct vcpu *v = system_state >= SYS_STATE_smp_boot ? current : NULL;
if ( guest_mode(regs) && has_hvm_container_vcpu(v) )
{

View File

@ -1,50 +0,0 @@
# Commit 71bb7304e7a7a35ea6df4b0cedebc35028e4c159
# Date 2015-06-30 15:00:54 +0100
# Author Liang Li <liang.z.li@intel.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
nested EPT: fix the handling of nested EPT
If the host EPT entry is changed, the nested EPT should be updated.
The current code does not do this, and it's wrong.
I have tested this patch; the L2 guest can boot and run as normal.
Signed-off-by: Liang Li <liang.z.li@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
Reported-by: Tim Deegan <tim@xen.org>
Reviewed-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -26,6 +26,7 @@
#include <asm/p2m.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/iommu.h>
#include <asm/mtrr.h>
#include <asm/hvm/cacheattr.h>
@@ -1040,6 +1041,9 @@ void ept_sync_domain(struct p2m_domain *
ASSERT(local_irq_is_enabled());
+ if ( nestedhvm_enabled(d) && !p2m_is_nestedp2m(p2m) )
+ p2m_flush_nestedp2m(d);
+
/*
* Flush active cpus synchronously. Flush others the next time this domain
* is scheduled onto them. We accept the race of other CPUs adding to
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1713,6 +1713,12 @@ p2m_flush_table(struct p2m_domain *p2m)
ASSERT(page_list_empty(&p2m->pod.super));
ASSERT(page_list_empty(&p2m->pod.single));
+ if ( p2m->np2m_base == P2M_BASE_EADDR )
+ {
+ p2m_unlock(p2m);
+ return;
+ }
+
/* This is no longer a valid nested p2m for any address space */
p2m->np2m_base = P2M_BASE_EADDR;

View File

@ -1,64 +0,0 @@
# Commit e4e9d2d4e76bd8fe229c124bd57fc6ba824271b3
# Date 2015-07-07 11:37:26 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/p2m-ept: don't unmap the EPT pagetable while it is still in use
The call to iommu_pte_flush() between the two hunks uses &ept_entry->epte,
which is a pointer into the mapped page.
It is eventually passed to the `clflush` instruction, which will suffer a page fault
if the virtual mapping has fallen out of the TLB.
(XEN) ----[ Xen-4.5.0-xs102594-d x86_64 debug=y Not tainted ]----
(XEN) CPU: 7
(XEN) RIP: e008:[<ffff82d0801572f0>] cacheline_flush+0x4/0x9
<snip>
(XEN) Xen call trace:
(XEN) [<ffff82d0801572f0>] cacheline_flush+0x4/0x9
(XEN) [<ffff82d08014ffff>] __iommu_flush_cache+0x4a/0x6a
(XEN) [<ffff82d0801532e2>] iommu_pte_flush+0x2b/0xd5
(XEN) [<ffff82d0801f909a>] ept_set_entry+0x4bc/0x61f
(XEN) [<ffff82d0801f0c25>] p2m_set_entry+0xd1/0x112
(XEN) [<ffff82d0801f25b1>] clear_mmio_p2m_entry+0x1a0/0x200
(XEN) [<ffff82d0801f4aac>] unmap_mmio_regions+0x49/0x73
(XEN) [<ffff82d080106292>] do_domctl+0x15bd/0x1edb
(XEN) [<ffff82d080234fcb>] syscall_enter+0xeb/0x145
(XEN)
(XEN) Pagetable walk from ffff820040004ae0:
(XEN) L4[0x104] = 00000008668a5063 ffffffffffffffff
(XEN) L3[0x001] = 00000008668a3063 ffffffffffffffff
(XEN) L2[0x000] = 000000086689c063 ffffffffffffffff
(XEN) L1[0x004] = 000000056f078063 000000000007f678
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 7:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0000]
(XEN) Faulting linear address: ffff820040004ae0
(XEN) ****************************************
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -764,8 +764,6 @@ ept_set_entry(struct p2m_domain *p2m, un
p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
out:
- unmap_domain_page(table);
-
if ( needs_sync != sync_off )
ept_sync_domain(p2m);
@@ -788,6 +786,8 @@ out:
}
}
+ unmap_domain_page(table);
+
/* Release the old intermediate tables, if any. This has to be the
last thing we do, after the ept_sync_domain() and removal
from the iommu tables, so as to avoid a potential

View File

@ -1,88 +0,0 @@
# Commit 8022b05284dea80e24813d03180788ec7277a0bd
# Date 2015-07-07 14:29:39 +0200
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86 / cpupool: clear the proper cpu_valid bit on pCPU teardown
In fact, when a pCPU goes down, we want to clear its
bit in the correct cpupool's valid mask, rather than
always in cpupool0's one.
Before this commit, all the pCPUs in the non-default
pool(s) would be considered immediately valid during
system resume, even the ones that have not been brought
up yet. As a result, the (Credit1) scheduler would attempt
to run its load-balancing logic on them, causing the
following Oops:
# xl cpupool-cpu-remove Pool-0 8-15
# xl cpupool-create name=\"Pool-1\"
# xl cpupool-cpu-add Pool-1 8-15
--> suspend
--> resume
(XEN) ----[ Xen-4.6-unstable x86_64 debug=y Tainted: C ]----
(XEN) CPU: 8
(XEN) RIP: e008:[<ffff82d080123078>] csched_schedule+0x4be/0xb97
(XEN) RFLAGS: 0000000000010087 CONTEXT: hypervisor
(XEN) rax: 80007d2f7fccb780 rbx: 0000000000000009 rcx: 0000000000000000
(XEN) rdx: ffff82d08031ed40 rsi: ffff82d080334980 rdi: 0000000000000000
(XEN) rbp: ffff83010000fe20 rsp: ffff83010000fd40 r8: 0000000000000004
(XEN) r9: 0000ffff0000ffff r10: 00ff00ff00ff00ff r11: 0f0f0f0f0f0f0f0f
(XEN) r12: ffff8303191ea870 r13: ffff8303226aadf0 r14: 0000000000000009
(XEN) r15: 0000000000000008 cr0: 000000008005003b cr4: 00000000000026f0
(XEN) cr3: 00000000dba9d000 cr2: 0000000000000000
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: e008
(XEN) ... ... ...
(XEN) Xen call trace:
(XEN) [<ffff82d080123078>] csched_schedule+0x4be/0xb97
(XEN) [<ffff82d08012c732>] schedule+0x12a/0x63c
(XEN) [<ffff82d08012f8c8>] __do_softirq+0x82/0x8d
(XEN) [<ffff82d08012f920>] do_softirq+0x13/0x15
(XEN) [<ffff82d080164791>] idle_loop+0x5b/0x6b
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 8:
(XEN) GENERAL PROTECTION FAULT
(XEN) [error_code=0000]
(XEN) ****************************************
The reason why the error is a #GP fault is that, without
this commit, we try to access the per-cpu area of a pCPU
that has not yet been allocated and initialized.
In fact, %rax, which is what is used as the pointer, is
80007d2f7fccb780, and we also have this:
#define INVALID_PERCPU_AREA (0x8000000000000000L - (long)__per_cpu_start)
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Juergen Gross <jgross@suse.com>
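To spell out why this surfaces as a general protection fault rather than a page fault (an editor's reading; the exact numbers depend on the link-time address of __per_cpu_start):
/*
 * per-cpu access  =  &per_cpu_var + __per_cpu_offset[cpu]
 *                 =  &per_cpu_var + (0x8000000000000000L - __per_cpu_start)
 *                 ~  0x80007d2f7fccb780            (the %rax value above)
 *
 * Bit 63 of that address is set but bits 62:48 are not, so it is not
 * canonical on x86-64, and dereferencing a non-canonical address raises
 * #GP(0) -- hence "GENERAL PROTECTION FAULT" instead of a pagetable walk.
 */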
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -816,7 +816,6 @@ void __cpu_disable(void)
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
- cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
cpumask_clear_cpu(cpu, &cpu_online_map);
fixup_irqs();
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -529,6 +529,7 @@ static int cpupool_cpu_remove(unsigned i
if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) )
{
cpumask_set_cpu(cpu, (*c)->cpu_suspended);
+ cpumask_clear_cpu(cpu, (*c)->cpu_valid);
break;
}
}
@@ -551,6 +552,7 @@ static int cpupool_cpu_remove(unsigned i
* If we are not suspending, we are hot-unplugging cpu, and that is
* allowed only for CPUs in pool0.
*/
+ cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
ret = 0;
}

View File

@ -1,141 +0,0 @@
# Commit 02ea5031825d984d52eb9a982b8457e3434137f0
# Date 2015-07-07 14:30:06 +0200
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
credit1: properly deal with pCPUs not in any cpupool
Ideally, the pCPUs that are 'free', i.e., not assigned
to any cpupool, should not be considered by the scheduler
for load balancing or anything. In Credit1, we fail at
this, because of how we use cpupool_scheduler_cpumask().
In fact, for a free pCPU, cpupool_scheduler_cpumask()
returns a pointer to cpupool_free_cpus, and hence, near
the top of csched_load_balance():
if ( unlikely(!cpumask_test_cpu(cpu, online)) )
goto out;
is false (the pCPU _is_ free!), and we therefore do not
jump to the end right away, as we should. This causes
the following splat when resuming from ACPI S3 with
pCPUs not assigned to any pool:
(XEN) ----[ Xen-4.6-unstable x86_64 debug=y Tainted: C ]----
(XEN) ... ... ...
(XEN) Xen call trace:
(XEN) [<ffff82d080122eaa>] csched_load_balance+0x213/0x794
(XEN) [<ffff82d08012374c>] csched_schedule+0x321/0x452
(XEN) [<ffff82d08012c85e>] schedule+0x12a/0x63c
(XEN) [<ffff82d08012fa09>] __do_softirq+0x82/0x8d
(XEN) [<ffff82d08012fa61>] do_softirq+0x13/0x15
(XEN) [<ffff82d080164780>] idle_loop+0x5b/0x6b
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 8:
(XEN) GENERAL PROTECTION FAULT
(XEN) [error_code=0000]
(XEN) ****************************************
The cure is:
* use cpupool_online_cpumask(), as a better guard to the
case when the cpu is being offlined;
* explicitly check whether the cpu is free.
SEDF is in a similar situation, so fix it too.
Still in Credit1, we must make sure that free (or offline)
CPUs are not considered "ticklable". Not doing so would impair
the load balancing algorithm, making the scheduler think that
it is possible to 'ask' the pCPU to pick up some work, while
in reality, that will never happen! Evidence of such behavior
is shown in this trace:
Name CPU list
Pool-0 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0.112998198 | ||.|| -|x||-|- d0v0 runstate_change d0v4 offline->runnable
] 0.112998198 | ||.|| -|x||-|- d0v0 22006(2:2:6) 1 [ f ]
] 0.112999612 | ||.|| -|x||-|- d0v0 28004(2:8:4) 2 [ 0 4 ]
0.113003387 | ||.|| -||||-|x d32767v15 runstate_continue d32767v15 running->running
where "22006(2:2:6) 1 [ f ]" means that pCPU 15, which is
free from any pool, is tickled.
The cure, in this case, is to filter out the free pCPUs,
within __runq_tickle().
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Juergen Gross <jgross@suse.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -350,12 +350,17 @@ __runq_tickle(unsigned int cpu, struct c
{
struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu));
struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
- cpumask_t mask, idle_mask;
+ cpumask_t mask, idle_mask, *online;
int balance_step, idlers_empty;
ASSERT(cur);
cpumask_clear(&mask);
- idlers_empty = cpumask_empty(prv->idlers);
+
+ /* cpu is vc->processor, so it must be in a cpupool. */
+ ASSERT(per_cpu(cpupool, cpu) != NULL);
+ online = cpupool_online_cpumask(per_cpu(cpupool, cpu));
+ cpumask_and(&idle_mask, prv->idlers, online);
+ idlers_empty = cpumask_empty(&idle_mask);
/*
@@ -392,8 +397,8 @@ __runq_tickle(unsigned int cpu, struct c
/* Are there idlers suitable for new (for this balance step)? */
csched_balance_cpumask(new->vcpu, balance_step,
csched_balance_mask);
- cpumask_and(&idle_mask, prv->idlers, csched_balance_mask);
- new_idlers_empty = cpumask_empty(&idle_mask);
+ cpumask_and(csched_balance_mask, csched_balance_mask, &idle_mask);
+ new_idlers_empty = cpumask_empty(csched_balance_mask);
/*
* Let's not be too harsh! If there aren't idlers suitable
@@ -1494,6 +1499,7 @@ static struct csched_vcpu *
csched_load_balance(struct csched_private *prv, int cpu,
struct csched_vcpu *snext, bool_t *stolen)
{
+ struct cpupool *c = per_cpu(cpupool, cpu);
struct csched_vcpu *speer;
cpumask_t workers;
cpumask_t *online;
@@ -1501,10 +1507,13 @@ csched_load_balance(struct csched_privat
int node = cpu_to_node(cpu);
BUG_ON( cpu != snext->vcpu->processor );
- online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu));
+ online = cpupool_online_cpumask(c);
- /* If this CPU is going offline we shouldn't steal work. */
- if ( unlikely(!cpumask_test_cpu(cpu, online)) )
+ /*
+ * If this CPU is going offline, or is not (yet) part of any cpupool
+ * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
+ */
+ if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
goto out;
if ( snext->pri == CSCHED_PRI_IDLE )
--- a/xen/common/sched_sedf.c
+++ b/xen/common/sched_sedf.c
@@ -791,7 +791,8 @@ static struct task_slice sedf_do_schedul
if ( tasklet_work_scheduled ||
(list_empty(runq) && list_empty(waitq)) ||
unlikely(!cpumask_test_cpu(cpu,
- cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)))) )
+ cpupool_online_cpumask(per_cpu(cpupool, cpu))) ||
+ per_cpu(cpupool, cpu) == NULL) )
{
ret.task = IDLETASK(cpu);
ret.time = SECONDS(1);

View File

@ -1,68 +0,0 @@
# Commit bbbe7e7157a964c485fb861765be291734676932
# Date 2015-07-07 14:39:27 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvmloader: avoid data corruption with xenstore reads/writes
The functions ring_read() and ring_write() have logic to try to deal with
partial reads and writes.
However, in all cases where the "while (len)" loop executed twice, data
corruption would occur because the second memcpy() started from the beginning of
"data" again, rather than from where the first part left off.
This bug manifested itself as protocol corruption when a reply header crossed
the first wrap of the response ring. However, similar corruption would also
occur if hvmloader observed xenstored performing partial writes of the block
in question, or if hvmloader had to wait for xenstored to make space in either
ring.
Reported-by: Adam Kucia <djexit@o2.pl>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
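Distilled to its essence (an editor's sketch, not the hvmloader code, and omitting the wait-for-free-space handshake the real functions also perform), a wrap-aware copy has to advance a "done" cursor so that each partial memcpy() continues where the previous one stopped:
#include <stdint.h>
#include <string.h>
#define RING_SIZE 1024u                         /* must be a power of two */
static void ring_copy_in(uint8_t *ring, uint32_t *prod,
                         const uint8_t *data, uint32_t len)
{
    uint32_t done = 0;
    while ( len )
    {
        uint32_t idx  = *prod & (RING_SIZE - 1);
        uint32_t part = RING_SIZE - idx;        /* bytes until the wrap */
        if ( part > len )
            part = len;
        memcpy(ring + idx, data + done, part);  /* data + done, not data */
        *prod += part;
        len   -= part;
        done  += part;
    }
}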
--- a/tools/firmware/hvmloader/xenbus.c
+++ b/tools/firmware/hvmloader/xenbus.c
@@ -105,7 +105,7 @@ void xenbus_shutdown(void)
/* Helper functions: copy data in and out of the ring */
static void ring_write(const char *data, uint32_t len)
{
- uint32_t part;
+ uint32_t part, done = 0;
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
@@ -122,16 +122,18 @@ static void ring_write(const char *data,
if ( part > len )
part = len;
- memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod), data, part);
+ memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod),
+ data + done, part);
barrier(); /* = wmb before prod write, rmb before next cons read */
rings->req_prod += part;
len -= part;
+ done += part;
}
}
static void ring_read(char *data, uint32_t len)
{
- uint32_t part;
+ uint32_t part, done = 0;
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
@@ -148,10 +150,12 @@ static void ring_read(char *data, uint32
if ( part > len )
part = len;
- memcpy(data, rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
+ memcpy(data + done,
+ rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
barrier(); /* = wmb before cons write, rmb before next prod read */
rings->rsp_cons += part;
len -= part;
+ done += part;
}
}

View File

@ -1,102 +0,0 @@
# Commit 39c6664a0e6e1b4ed80660d545dff34ce41bee31
# Date 2015-07-07 15:10:45 +0100
# Author Ian Campbell <ian.campbell@citrix.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
xen: earlycpio: Pull in latest linux earlycpio.[ch]
AFAICT our current version does not correspond to any version in the
Linux history. This commit resynchronises it to the state in Linux
commit 598bae70c2a8e35c8d39b610cca2b32afcf047af.
Differences from upstream: find_cpio_data is __init, printk instead of
pr_*.
This appears to fix Debian bug #785187. "Appears" because my test box
happens to be AMD and the issue is that the (valid) cpio generated by
the Intel ucode is not liked by the old Xen code. I've tested by
hacking the hypervisor to look for the Intel path.
Reported-by: Stephan Seitz <stse+debianbugs@fsing.rootsland.net>
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Stephan Seitz <stse+debianbugs@fsing.rootsland.net>
Cc: 785187@bugs.debian.org
Acked-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/earlycpio.c
+++ b/xen/common/earlycpio.c
@@ -54,25 +54,26 @@ enum cpio_fields {
/**
* cpio_data find_cpio_data - Search for files in an uncompressed cpio
- * @path: The directory to search for, including a slash at the end
- * @data: Pointer to the the cpio archive or a header inside
- * @len: Remaining length of the cpio based on data pointer
- * @offset: When a matching file is found, this is the offset to the
- * beginning of the cpio. It can be used to iterate through
- * the cpio to find all files inside of a directory path
+ * @path: The directory to search for, including a slash at the end
+ * @data: Pointer to the the cpio archive or a header inside
+ * @len: Remaining length of the cpio based on data pointer
+ * @nextoff: When a matching file is found, this is the offset from the
+ * beginning of the cpio to the beginning of the next file, not the
+ * matching file itself. It can be used to iterate through the cpio
+ * to find all files inside of a directory path.
*
- * @return: struct cpio_data containing the address, length and
- * filename (with the directory path cut off) of the found file.
- * If you search for a filename and not for files in a directory,
- * pass the absolute path of the filename in the cpio and make sure
- * the match returned an empty filename string.
+ * @return: struct cpio_data containing the address, length and
+ * filename (with the directory path cut off) of the found file.
+ * If you search for a filename and not for files in a directory,
+ * pass the absolute path of the filename in the cpio and make sure
+ * the match returned an empty filename string.
*/
struct cpio_data __init find_cpio_data(const char *path, void *data,
- size_t len, long *offset)
+ size_t len, long *nextoff)
{
const size_t cpio_header_len = 8*C_NFIELDS - 2;
- struct cpio_data cd = { NULL, 0 };
+ struct cpio_data cd = { NULL, 0, "" };
const char *p, *dptr, *nptr;
unsigned int ch[C_NFIELDS], *chp, v;
unsigned char c, x;
@@ -129,17 +130,17 @@ struct cpio_data __init find_cpio_data(c
if ((ch[C_MODE] & 0170000) == 0100000 &&
ch[C_NAMESIZE] >= mypathsize &&
!memcmp(p, path, mypathsize)) {
- *offset = (long)nptr - (long)data;
+ *nextoff = (long)nptr - (long)data;
if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) {
printk(
"File %s exceeding MAX_CPIO_FILE_NAME [%d]\n",
p, MAX_CPIO_FILE_NAME);
}
- if (ch[C_NAMESIZE] - 1 /* includes \0 */ == mypathsize) {
- cd.data = (void *)dptr;
- cd.size = ch[C_FILESIZE];
- return cd; /* Found it! */
- }
+ strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME);
+
+ cd.data = (void *)dptr;
+ cd.size = ch[C_FILESIZE];
+ return cd; /* Found it! */
}
len -= (nptr - p);
p = nptr;
--- a/xen/include/xen/earlycpio.h
+++ b/xen/include/xen/earlycpio.h
@@ -6,6 +6,7 @@
struct cpio_data {
void *data;
size_t size;
+ char name[MAX_CPIO_FILE_NAME];
};
struct cpio_data find_cpio_data(const char *path, void *data, size_t len,

View File

@ -1,37 +0,0 @@
Subject: xl: correct handling of extra_config in main_cpupoolcreate
From: Wei Liu wei.liu2@citrix.com Tue Jul 14 17:41:10 2015 +0100
Date: Wed Jul 15 10:58:08 2015 +0100:
Git: 705c9e12426cba82804cb578fc70785281655d94
Don't dereference extra_config if it's NULL. Don't leak extra_config in
the end.
Also fixed a typo in error string while I was there.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Index: xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/xl_cmdimpl.c
+++ xen-4.5.1-testing/tools/libxl/xl_cmdimpl.c
@@ -7085,9 +7085,9 @@ int main_cpupoolcreate(int argc, char **
else
config_src="command line";
- if (strlen(extra_config)) {
+ if (extra_config && strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2)) {
- fprintf(stderr, "Failed to attach extra configration\n");
+ fprintf(stderr, "Failed to attach extra configuration\n");
goto out;
}
config_data = xrealloc(config_data,
@@ -7211,6 +7211,7 @@ out_cfg:
out:
free(name);
free(config_data);
+ free(extra_config);
return rc;
}

View File

@ -1,24 +0,0 @@
# Commit b1c780cd315eb4db06be3bbb5c6d80b1cabd27a9
# Date 2015-07-15 16:11:42 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
make rangeset_report_ranges() report all ranges
find_range() returns NULL when s is below the lowest range, so we have
to use first_range() here (which is just as good performance-wise), or else
no range gets reported at all in that case.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
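A small worked example (editor's illustration) of the difference: with a rangeset holding only [5,10] and a query of s = 2, e = 7,
/*
 *   find_range(r, 2)  ->  NULL, because 2 lies below the lowest range,
 *                         so the old loop reported nothing at all;
 *   first_range(r)    ->  [5,10], the "x->e >= s" test passes, and the
 *                         max/min clamping reports the overlap [5,7].
 */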
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -289,7 +289,7 @@ int rangeset_report_ranges(
read_lock(&r->lock);
- for ( x = find_range(r, s); x && (x->s <= e) && !rc; x = next_range(r, x) )
+ for ( x = first_range(r); x && (x->s <= e) && !rc; x = next_range(r, x) )
if ( x->e >= s )
rc = cb(max(x->s, s), min(x->e, e), ctxt);

View File

@ -1,135 +0,0 @@
# Commit a8bc99b981c5ad773bd646f5986e616d26fb94d7
# Date 2015-07-16 11:50:07 +0200
# Author Elena Ufimtseva <elena.ufimtseva@oracle.com>
# Committer Jan Beulich <jbeulich@suse.com>
dmar: device scope mem leak fix
Release memory allocated for scope.devices dmar units on various
failure paths and when disabling dmar. Set device count after
successful memory allocation, not before, in the device scope parsing function.
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
# Commit 132231d10343608faf5892785a08acc500326d04
# Date 2015-07-16 15:23:37 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
dmar: fix double free in error paths following c/s a8bc99b
Several error paths would end up freeing scope->devices twice.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -80,6 +80,16 @@ static int __init acpi_register_rmrr_uni
return 0;
}
+static void scope_devices_free(struct dmar_scope *scope)
+{
+ if ( !scope )
+ return;
+
+ scope->devices_cnt = 0;
+ xfree(scope->devices);
+ scope->devices = NULL;
+}
+
static void __init disable_all_dmar_units(void)
{
struct acpi_drhd_unit *drhd, *_drhd;
@@ -89,16 +99,19 @@ static void __init disable_all_dmar_unit
list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list )
{
list_del(&drhd->list);
+ scope_devices_free(&drhd->scope);
xfree(drhd);
}
list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list )
{
list_del(&rmrr->list);
+ scope_devices_free(&rmrr->scope);
xfree(rmrr);
}
list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list )
{
list_del(&atsr->list);
+ scope_devices_free(&atsr->scope);
xfree(atsr);
}
}
@@ -317,13 +330,13 @@ static int __init acpi_parse_dev_scope(
if ( (cnt = scope_device_count(start, end)) < 0 )
return cnt;
- scope->devices_cnt = cnt;
if ( cnt > 0 )
{
scope->devices = xzalloc_array(u16, cnt);
if ( !scope->devices )
return -ENOMEM;
}
+ scope->devices_cnt = cnt;
while ( start < end )
{
@@ -426,7 +439,7 @@ static int __init acpi_parse_dev_scope(
out:
if ( ret )
- xfree(scope->devices);
+ scope_devices_free(scope);
return ret;
}
@@ -541,6 +554,7 @@ acpi_parse_one_drhd(struct acpi_dmar_hea
" Workaround BIOS bug: ignore the DRHD due to all "
"devices under its scope are not PCI discoverable!\n");
+ scope_devices_free(&dmaru->scope);
iommu_free(dmaru);
xfree(dmaru);
}
@@ -561,9 +575,11 @@ acpi_parse_one_drhd(struct acpi_dmar_hea
out:
if ( ret )
{
+ scope_devices_free(&dmaru->scope);
iommu_free(dmaru);
xfree(dmaru);
}
+
return ret;
}
@@ -657,6 +673,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_hea
" Ignore the RMRR (%"PRIx64", %"PRIx64") due to "
"devices under its scope are not PCI discoverable!\n",
rmrru->base_address, rmrru->end_address);
+ scope_devices_free(&rmrru->scope);
xfree(rmrru);
}
else if ( base_addr > end_addr )
@@ -664,6 +681,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_hea
dprintk(XENLOG_WARNING VTDPREFIX,
" The RMRR (%"PRIx64", %"PRIx64") is incorrect!\n",
rmrru->base_address, rmrru->end_address);
+ scope_devices_free(&rmrru->scope);
xfree(rmrru);
ret = -EFAULT;
}
@@ -726,7 +744,10 @@ acpi_parse_one_atsr(struct acpi_dmar_hea
}
if ( ret )
+ {
+ scope_devices_free(&atsru->scope);
xfree(atsru);
+ }
else
acpi_register_atsr_unit(atsru);
return ret;

View File

@ -1,120 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit a88b72fddd046a0978242411276861039ec99ad0
# Date 2015-07-23 10:13:12 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/PCI: add config space abstract write intercept logic
This is to be used by MSI code, and later to also be hooked up to
MMCFG accesses by Dom0.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1108,6 +1108,12 @@ void pci_cleanup_msi(struct pci_dev *pde
msi_free_irqs(pdev);
}
+int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
+ unsigned int size, uint32_t *data)
+{
+ return 0;
+}
+
int pci_restore_msi_state(struct pci_dev *pdev)
{
unsigned long flags;
--- a/xen/arch/x86/pci.c
+++ b/xen/arch/x86/pci.c
@@ -67,3 +67,28 @@ void pci_conf_write(uint32_t cf8, uint8_
spin_unlock_irqrestore(&pci_config_lock, flags);
}
+
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+ unsigned int reg, unsigned int size,
+ uint32_t *data)
+{
+ struct pci_dev *pdev;
+ int rc = 0;
+
+ /*
+ * Avoid expensive operations when no hook is going to do anything
+ * for the access anyway.
+ */
+ if ( reg < 64 || reg >= 256 )
+ return 0;
+
+ spin_lock(&pcidevs_lock);
+
+ pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+ if ( pdev )
+ rc = pci_msi_conf_write_intercept(pdev, reg, size, data);
+
+ spin_unlock(&pcidevs_lock);
+
+ return rc;
+}
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1708,8 +1708,8 @@ static int admin_io_okay(
return ioports_access_permitted(v->domain, port, port + bytes - 1);
}
-static bool_t pci_cfg_ok(struct domain *currd, bool_t write,
- unsigned int start, unsigned int size)
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+ unsigned int size, uint32_t *write)
{
uint32_t machine_bdf;
@@ -1741,8 +1741,12 @@ static bool_t pci_cfg_ok(struct domain *
start |= CF8_ADDR_HI(currd->arch.pci_cf8);
}
- return !xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
- start, start + size - 1, write);
+ if ( xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+ start, start + size - 1, !!write) != 0 )
+ return 0;
+
+ return !write ||
+ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
}
uint32_t guest_io_read(
@@ -1796,7 +1800,7 @@ uint32_t guest_io_read(
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 0, port & 3, size) )
+ if ( pci_cfg_ok(v->domain, port & 3, size, NULL) )
sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
}
@@ -1869,7 +1873,7 @@ void guest_io_write(
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 1, port & 3, size) )
+ if ( pci_cfg_ok(v->domain, port & 3, size, &data) )
pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
}
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -15,4 +15,11 @@ struct arch_pci_dev {
vmask_t used_vectors;
};
+struct pci_dev;
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+ unsigned int reg, unsigned int size,
+ uint32_t *data);
+int pci_msi_conf_write_intercept(struct pci_dev *, unsigned int reg,
+ unsigned int size, uint32_t *data);
+
#endif /* __X86_PCI_H__ */

View File

@ -1,75 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 484d7c852e4ff79c945406ed28b5db63a5a0b7f3
# Date 2015-07-23 10:14:13 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI-X: track host and guest mask-all requests separately
Host uses of the bits will be added subsequently, and must not be
overridden by guests (including Dom0, namely when acting on behalf of
a guest).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
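The intended combination rule (an editor's sketch of the semantics, with a hypothetical helper; only PCI_MSIX_FLAGS_MASKALL is a real constant) is that the value reaching the device keeps mask-all set whenever the host needs it, regardless of what the guest wrote, while the guest's own request is remembered separately:
#include <stdbool.h>
#include <stdint.h>
#define PCI_MSIX_FLAGS_MASKALL 0x4000
static uint16_t effective_control(uint16_t guest_write, bool host_maskall,
                                  bool *guest_maskall)
{
    /* Record what the guest asked for... */
    *guest_maskall = guest_write & PCI_MSIX_FLAGS_MASKALL;
    /* ...but never let a guest write clear a host-requested mask-all. */
    if ( host_maskall )
        guest_write |= PCI_MSIX_FLAGS_MASKALL;
    return guest_write;
}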
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -843,6 +843,12 @@ static int msix_capability_init(struct p
if ( !msix->used_entries )
{
+ msix->host_maskall = 0;
+ if ( !msix->guest_maskall )
+ control &= ~PCI_MSIX_FLAGS_MASKALL;
+ else
+ control |= PCI_MSIX_FLAGS_MASKALL;
+
if ( rangeset_add_range(mmio_ro_ranges, msix->table.first,
msix->table.last) )
WARN();
@@ -1111,6 +1117,34 @@ void pci_cleanup_msi(struct pci_dev *pde
int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
unsigned int size, uint32_t *data)
{
+ u16 seg = pdev->seg;
+ u8 bus = pdev->bus;
+ u8 slot = PCI_SLOT(pdev->devfn);
+ u8 func = PCI_FUNC(pdev->devfn);
+ struct msi_desc *entry;
+ unsigned int pos;
+
+ if ( pdev->msix )
+ {
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
+ pos = entry ? entry->msi_attrib.pos
+ : pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+ ASSERT(pos);
+
+ if ( reg < pos || reg >= msix_pba_offset_reg(pos) + 4 )
+ return 0;
+
+ if ( reg != msix_control_reg(pos) || size != 2 )
+ return -EACCES;
+
+ pdev->msix->guest_maskall = !!(*data & PCI_MSIX_FLAGS_MASKALL);
+ if ( pdev->msix->host_maskall )
+ *data |= PCI_MSIX_FLAGS_MASKALL;
+
+ return 1;
+ }
+
return 0;
}
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -228,6 +228,7 @@ struct arch_msix {
int table_refcnt[MAX_MSIX_TABLE_PAGES];
int table_idx[MAX_MSIX_TABLE_PAGES];
spinlock_t table_lock;
+ bool_t host_maskall, guest_maskall;
domid_t warned;
};

View File

@ -1,351 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 082fdc6ce85e5b603f8fb24553cf200e3b67889f
# Date 2015-07-23 10:14:59 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI-X: be more careful during teardown
When a device gets detached from a guest, pciback will clear its
command register, thus disabling both memory and I/O decoding. The
disabled memory decoding, however, has an effect on the MSI-X table
accesses the hypervisor does: These won't have the intended effect
anymore. Even worse, for PCIe devices (but not SR-IOV virtual
functions) such accesses may (will?) be treated as Unsupported
Requests, causing respective errors to be surfaced, potentially in the
form of NMIs that may be fatal to the hypervisor or Dom0 in different
ways. Hence rather than carrying out these accesses, we should avoid
them where we can, and use alternative (e.g. PCI config space based)
mechanisms to achieve at least the same effect.
At this time it continues to be unclear whether this is fixing an
actual bug or is rather just working around bogus (but apparently
common) system behavior.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Backporting note (largely to myself):
Depends on (not yet backported to 4.4 and earlier) commit 061eebe0e
"x86/MSI: drop workaround for insecure Dom0 kernels" (due to re-use
of struct arch_msix's warned field).
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -217,9 +217,9 @@ void destroy_irq(unsigned int irq)
}
spin_lock_irqsave(&desc->lock, flags);
- desc->status |= IRQ_DISABLED;
desc->status &= ~IRQ_GUEST;
desc->handler->shutdown(desc);
+ desc->status |= IRQ_DISABLED;
action = desc->action;
desc->action = NULL;
desc->msi_desc = NULL;
@@ -995,8 +995,8 @@ void __init release_irq(unsigned int irq
spin_lock_irqsave(&desc->lock,flags);
action = desc->action;
desc->action = NULL;
- desc->status |= IRQ_DISABLED;
desc->handler->shutdown(desc);
+ desc->status |= IRQ_DISABLED;
spin_unlock_irqrestore(&desc->lock,flags);
/* Wait to make sure it's not being used on another CPU */
@@ -1732,8 +1732,8 @@ static irq_guest_action_t *__pirq_guest_
BUG_ON(action->in_flight != 0);
/* Disabling IRQ before releasing the desc_lock avoids an IRQ storm. */
- desc->status |= IRQ_DISABLED;
desc->handler->disable(desc);
+ desc->status |= IRQ_DISABLED;
/*
* Mark any remaining pending EOIs as ready to flush.
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -123,6 +123,27 @@ static void msix_put_fixmap(struct arch_
spin_unlock(&msix->table_lock);
}
+static bool_t memory_decoded(const struct pci_dev *dev)
+{
+ u8 bus, slot, func;
+
+ if ( !dev->info.is_virtfn )
+ {
+ bus = dev->bus;
+ slot = PCI_SLOT(dev->devfn);
+ func = PCI_FUNC(dev->devfn);
+ }
+ else
+ {
+ bus = dev->info.physfn.bus;
+ slot = PCI_SLOT(dev->info.physfn.devfn);
+ func = PCI_FUNC(dev->info.physfn.devfn);
+ }
+
+ return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
+ PCI_COMMAND_MEMORY);
+}
+
/*
* MSI message composition
*/
@@ -166,7 +187,7 @@ void msi_compose_msg(unsigned vector, co
}
}
-static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
switch ( entry->msi_attrib.type )
{
@@ -201,6 +222,8 @@ static void read_msi_msg(struct msi_desc
{
void __iomem *base = entry->mask_base;
+ if ( unlikely(!memory_decoded(entry->dev)) )
+ return 0;
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
@@ -212,6 +235,8 @@ static void read_msi_msg(struct msi_desc
if ( iommu_intremap )
iommu_read_msi_from_ire(entry, msg);
+
+ return 1;
}
static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -262,6 +287,8 @@ static int write_msi_msg(struct msi_desc
{
void __iomem *base = entry->mask_base;
+ if ( unlikely(!memory_decoded(entry->dev)) )
+ return -ENXIO;
writel(msg->address_lo,
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
writel(msg->address_hi,
@@ -289,7 +316,8 @@ void set_msi_affinity(struct irq_desc *d
ASSERT(spin_is_locked(&desc->lock));
memset(&msg, 0, sizeof(msg));
- read_msi_msg(msi_desc, &msg);
+ if ( !read_msi_msg(msi_desc, &msg) )
+ return;
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
@@ -349,23 +377,27 @@ int msi_maskable_irq(const struct msi_de
|| entry->msi_attrib.maskbit;
}
-static void msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
+static bool_t msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
{
struct msi_desc *entry = desc->msi_desc;
+ struct pci_dev *pdev;
+ u16 seg;
+ u8 bus, slot, func;
bool_t flag = host || guest;
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
+ pdev = entry->dev;
+ seg = pdev->seg;
+ bus = pdev->bus;
+ slot = PCI_SLOT(pdev->devfn);
+ func = PCI_FUNC(pdev->devfn);
switch ( entry->msi_attrib.type )
{
case PCI_CAP_ID_MSI:
if ( entry->msi_attrib.maskbit )
{
u32 mask_bits;
- u16 seg = entry->dev->seg;
- u8 bus = entry->dev->bus;
- u8 slot = PCI_SLOT(entry->dev->devfn);
- u8 func = PCI_FUNC(entry->dev->devfn);
mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
@@ -374,25 +406,54 @@ static void msi_set_mask_bit(struct irq_
}
break;
case PCI_CAP_ID_MSIX:
- {
- int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
- writel(flag, entry->mask_base + offset);
- readl(entry->mask_base + offset);
- break;
- }
+ if ( likely(memory_decoded(pdev)) )
+ {
+ writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ break;
+ }
+ if ( flag )
+ {
+ u16 control;
+ domid_t domid = pdev->domain->domain_id;
+
+ pdev->msix->host_maskall = 1;
+ control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
+ if ( control & PCI_MSIX_FLAGS_MASKALL )
+ break;
+ pci_conf_write16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control | PCI_MSIX_FLAGS_MASKALL);
+ if ( pdev->msix->warned != domid )
+ {
+ pdev->msix->warned = domid;
+ printk(XENLOG_G_WARNING
+ "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
+ desc->irq, domid, pdev->seg, pdev->bus,
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ }
+ break;
+ }
+ /* fall through */
default:
- BUG();
- break;
+ return 0;
}
entry->msi_attrib.host_masked = host;
entry->msi_attrib.guest_masked = guest;
+
+ return 1;
}
static int msi_get_mask_bit(const struct msi_desc *entry)
{
- switch (entry->msi_attrib.type) {
+ if ( !entry->dev )
+ return -1;
+
+ switch ( entry->msi_attrib.type )
+ {
case PCI_CAP_ID_MSI:
- if (!entry->dev || !entry->msi_attrib.maskbit)
+ if ( !entry->msi_attrib.maskbit )
break;
return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
PCI_SLOT(entry->dev->devfn),
@@ -400,6 +461,8 @@ static int msi_get_mask_bit(const struct
entry->msi.mpos) >>
entry->msi_attrib.entry_nr) & 1;
case PCI_CAP_ID_MSIX:
+ if ( unlikely(!memory_decoded(entry->dev)) )
+ break;
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
}
return -1;
@@ -407,12 +470,16 @@ static int msi_get_mask_bit(const struct
void mask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 1, desc->msi_desc->msi_attrib.guest_masked);
+ if ( unlikely(!msi_set_mask_bit(desc, 1,
+ desc->msi_desc->msi_attrib.guest_masked)) )
+ BUG_ON(!(desc->status & IRQ_DISABLED));
}
void unmask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 0, desc->msi_desc->msi_attrib.guest_masked);
+ if ( unlikely(!msi_set_mask_bit(desc, 0,
+ desc->msi_desc->msi_attrib.guest_masked)) )
+ WARN();
}
void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask)
@@ -422,13 +489,15 @@ void guest_mask_msi_irq(struct irq_desc
static unsigned int startup_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST));
+ if ( unlikely(!msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST))) )
+ WARN();
return 0;
}
static void shutdown_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 1, 1);
+ if ( unlikely(!msi_set_mask_bit(desc, 1, 1)) )
+ BUG_ON(!(desc->status & IRQ_DISABLED));
}
void ack_nonmaskable_msi_irq(struct irq_desc *desc)
@@ -740,6 +809,9 @@ static int msix_capability_init(struct p
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+ if ( unlikely(!memory_decoded(dev)) )
+ return -ENXIO;
+
if ( desc )
{
entry = alloc_msi_entry(1);
@@ -879,7 +951,8 @@ static int msix_capability_init(struct p
++msix->used_entries;
/* Restore MSI-X enabled bits */
- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_MASKALL);
return 0;
}
@@ -1024,8 +1097,16 @@ static void __pci_disable_msix(struct ms
BUG_ON(list_empty(&dev->msi_list));
- writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
+ if ( likely(memory_decoded(dev)) )
+ writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
+ {
+ printk(XENLOG_WARNING
+ "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
+ entry->irq, dev->seg, dev->bus,
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+ control |= PCI_MSIX_FLAGS_MASKALL;
+ }
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
_pci_cleanup_msix(dev->msix);
@@ -1199,15 +1280,24 @@ int pci_restore_msi_state(struct pci_dev
nr = entry->msi.nvec;
}
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+ {
msix_set_enable(pdev, 0);
+ if ( unlikely(!memory_decoded(pdev)) )
+ {
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return -ENXIO;
+ }
+ }
msg = entry->msg;
write_msi_msg(entry, &msg);
for ( i = 0; ; )
{
- msi_set_mask_bit(desc, entry[i].msi_attrib.host_masked,
- entry[i].msi_attrib.guest_masked);
+ if ( unlikely(!msi_set_mask_bit(desc,
+ entry[i].msi_attrib.host_masked,
+ entry[i].msi_attrib.guest_masked)) )
+ BUG();
if ( !--nr )
break;

View File

@ -1,335 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit 0dba393db07331e9cff42df10e95b67547dfdb3e
# Date 2015-07-23 10:15:39 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI-X: access MSI-X table only after having enabled MSI-X
As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a
better way") and its broken predecessor, make sure we don't access the
MSI-X table without having enabled MSI-X first, using the mask-all flag
instead to prevent interrupts from occurring.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
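As a rough, standalone illustration of the pattern the patch applies (not Xen code; the register model and the with_table_access()/program_entries() helpers are invented for this sketch), the idea is: if MSI-X is not yet enabled, enable it together with the mask-all bit so the table can be programmed without interrupts firing, then restore the original control value.

#include <stdint.h>
#include <stdio.h>

#define MSIX_ENABLE  (1u << 15)
#define MSIX_MASKALL (1u << 14)

static uint16_t msix_control = 0;              /* simulated config register */

static uint16_t read_control(void)    { return msix_control; }
static void write_control(uint16_t v) { msix_control = v; }

/* Touch the MSI-X table only while MSI-X is enabled and all vectors masked. */
static void with_table_access(void (*fn)(void))
{
    uint16_t control = read_control();

    if ( !(control & MSIX_ENABLE) )
        write_control(control | MSIX_ENABLE | MSIX_MASKALL);

    fn();                                      /* safe: no interrupts can fire */

    write_control(control);                    /* restore the original state */
}

static void program_entries(void) { puts("programming MSI-X table entries"); }

int main(void)
{
    with_table_access(program_entries);
    printf("control restored to %#x\n", (unsigned)msix_control);
    return 0;
}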
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -144,6 +144,17 @@ static bool_t memory_decoded(const struc
PCI_COMMAND_MEMORY);
}
+static bool_t msix_memory_decoded(const struct pci_dev *dev, unsigned int pos)
+{
+ u16 control = pci_conf_read16(dev->seg, dev->bus, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn), msix_control_reg(pos));
+
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ return 0;
+
+ return memory_decoded(dev);
+}
+
/*
* MSI message composition
*/
@@ -222,7 +233,8 @@ static bool_t read_msi_msg(struct msi_de
{
void __iomem *base = entry->mask_base;
- if ( unlikely(!memory_decoded(entry->dev)) )
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
return 0;
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -287,7 +299,8 @@ static int write_msi_msg(struct msi_desc
{
void __iomem *base = entry->mask_base;
- if ( unlikely(!memory_decoded(entry->dev)) )
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
return -ENXIO;
writel(msg->address_lo,
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
@@ -381,9 +394,9 @@ static bool_t msi_set_mask_bit(struct ir
{
struct msi_desc *entry = desc->msi_desc;
struct pci_dev *pdev;
- u16 seg;
+ u16 seg, control;
u8 bus, slot, func;
- bool_t flag = host || guest;
+ bool_t flag = host || guest, maskall;
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
@@ -406,36 +419,45 @@ static bool_t msi_set_mask_bit(struct ir
}
break;
case PCI_CAP_ID_MSIX:
+ maskall = pdev->msix->host_maskall;
+ control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+ {
+ pdev->msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
if ( likely(memory_decoded(pdev)) )
{
writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
- break;
+ if ( likely(control & PCI_MSIX_FLAGS_ENABLE) )
+ break;
+ flag = 1;
}
- if ( flag )
+ else if ( flag && !(control & PCI_MSIX_FLAGS_MASKALL) )
{
- u16 control;
domid_t domid = pdev->domain->domain_id;
- pdev->msix->host_maskall = 1;
- control = pci_conf_read16(seg, bus, slot, func,
- msix_control_reg(entry->msi_attrib.pos));
- if ( control & PCI_MSIX_FLAGS_MASKALL )
- break;
- pci_conf_write16(seg, bus, slot, func,
- msix_control_reg(entry->msi_attrib.pos),
- control | PCI_MSIX_FLAGS_MASKALL);
+ maskall = 1;
if ( pdev->msix->warned != domid )
{
pdev->msix->warned = domid;
printk(XENLOG_G_WARNING
- "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
+ "cannot mask IRQ %d: masking MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
desc->irq, domid, pdev->seg, pdev->bus,
PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
}
- break;
}
- /* fall through */
+ pdev->msix->host_maskall = maskall;
+ if ( maskall || pdev->msix->guest_maskall )
+ control |= PCI_MSIX_FLAGS_MASKALL;
+ pci_conf_write16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos), control);
+ return flag;
default:
return 0;
}
@@ -461,7 +483,8 @@ static int msi_get_mask_bit(const struct
entry->msi.mpos) >>
entry->msi_attrib.entry_nr) & 1;
case PCI_CAP_ID_MSIX:
- if ( unlikely(!memory_decoded(entry->dev)) )
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
break;
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
}
@@ -564,9 +587,31 @@ static struct msi_desc *alloc_msi_entry(
int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc)
{
- return __setup_msi_irq(desc, msidesc,
- msi_maskable_irq(msidesc) ? &pci_msi_maskable
- : &pci_msi_nonmaskable);
+ const struct pci_dev *pdev = msidesc->dev;
+ unsigned int cpos = msix_control_reg(msidesc->msi_attrib.pos);
+ u16 control = ~0;
+ int rc;
+
+ if ( msidesc->msi_attrib.type == PCI_CAP_ID_MSIX )
+ {
+ control = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos);
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos,
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
+
+ rc = __setup_msi_irq(desc, msidesc,
+ msi_maskable_irq(msidesc) ? &pci_msi_maskable
+ : &pci_msi_nonmaskable);
+
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos, control);
+
+ return rc;
}
int __setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc,
@@ -803,20 +848,38 @@ static int msix_capability_init(struct p
u8 bus = dev->bus;
u8 slot = PCI_SLOT(dev->devfn);
u8 func = PCI_FUNC(dev->devfn);
+ bool_t maskall = msix->host_maskall;
ASSERT(spin_is_locked(&pcidevs_lock));
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
- msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+ /*
+ * Ensure MSI-X interrupts are masked during setup. Some devices require
+ * MSI-X to be enabled before we can touch the MSI-X registers. We need
+ * to mask all the vectors to prevent interrupts coming in before they're
+ * fully set up.
+ */
+ msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
if ( unlikely(!memory_decoded(dev)) )
+ {
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENXIO;
+ }
if ( desc )
{
entry = alloc_msi_entry(1);
if ( !entry )
+ {
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENOMEM;
+ }
ASSERT(msi);
}
@@ -847,6 +910,8 @@ static int msix_capability_init(struct p
{
if ( !msi || !msi->table_base )
{
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
xfree(entry);
return -ENXIO;
}
@@ -889,6 +954,8 @@ static int msix_capability_init(struct p
if ( idx < 0 )
{
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
xfree(entry);
return idx;
}
@@ -915,7 +982,7 @@ static int msix_capability_init(struct p
if ( !msix->used_entries )
{
- msix->host_maskall = 0;
+ maskall = 0;
if ( !msix->guest_maskall )
control &= ~PCI_MSIX_FLAGS_MASKALL;
else
@@ -951,8 +1018,8 @@ static int msix_capability_init(struct p
++msix->used_entries;
/* Restore MSI-X enabled bits */
- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
- control & ~PCI_MSIX_FLAGS_MASKALL);
+ msix->host_maskall = maskall;
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
return 0;
}
@@ -1092,8 +1159,15 @@ static void __pci_disable_msix(struct ms
PCI_CAP_ID_MSIX);
u16 control = pci_conf_read16(seg, bus, slot, func,
msix_control_reg(entry->msi_attrib.pos));
+ bool_t maskall = dev->msix->host_maskall;
- msix_set_enable(dev, 0);
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+ {
+ dev->msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
BUG_ON(list_empty(&dev->msi_list));
@@ -1105,8 +1179,11 @@ static void __pci_disable_msix(struct ms
"cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
entry->irq, dev->seg, dev->bus,
PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
- control |= PCI_MSIX_FLAGS_MASKALL;
+ maskall = 1;
}
+ dev->msix->host_maskall = maskall;
+ if ( maskall || dev->msix->guest_maskall )
+ control |= PCI_MSIX_FLAGS_MASKALL;
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
_pci_cleanup_msix(dev->msix);
@@ -1255,6 +1332,8 @@ int pci_restore_msi_state(struct pci_dev
list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
{
unsigned int i = 0, nr = 1;
+ u16 control = 0;
+ u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
irq = entry->irq;
desc = &irq_desc[irq];
@@ -1281,10 +1360,18 @@ int pci_restore_msi_state(struct pci_dev
}
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
{
- msix_set_enable(pdev, 0);
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
if ( unlikely(!memory_decoded(pdev)) )
{
spin_unlock_irqrestore(&desc->lock, flags);
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENXIO;
}
}
@@ -1314,11 +1401,9 @@ int pci_restore_msi_state(struct pci_dev
if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
{
unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
- u16 control = pci_conf_read16(pdev->seg, pdev->bus,
- PCI_SLOT(pdev->devfn),
- PCI_FUNC(pdev->devfn), cpos);
- control &= ~PCI_MSI_FLAGS_QSIZE;
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func, cpos) &
+ ~PCI_MSI_FLAGS_QSIZE;
multi_msi_enable(control, entry->msi.nvec);
pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn), cpos, control);
@@ -1326,7 +1411,9 @@ int pci_restore_msi_state(struct pci_dev
msi_set_enable(pdev, 1);
}
else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
- msix_set_enable(pdev, 1);
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control | PCI_MSIX_FLAGS_ENABLE);
}
return 0;

View File

@ -1,55 +0,0 @@
References: bsc#907514 bsc#910258 bsc#918984 bsc#923967
# Commit aa7c1fdf9dd04a1287f4770906b2c41b88a28228
# Date 2015-07-23 10:16:27 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI: properly track guest masking requests
... by monitoring writes to the mask register.
This allows reverting the main effect of the XSA-129 patches in qemu.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
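A minimal sketch of the interception idea, assuming a simplified per-vector state structure (vector_state and intercept_mask_write() are invented names; the real code walks struct msi_desc entries): record the guest's per-vector mask request from the 32-bit write, while keeping host-masked vectors masked in the value actually written to hardware.

#include <stdint.h>
#include <stdio.h>

struct vector_state {
    unsigned int guest_masked:1;
    unsigned int host_masked:1;
};

static uint32_t intercept_mask_write(struct vector_state *vec,
                                     unsigned int nvec, uint32_t data)
{
    unsigned int i;

    for ( i = 0; i < nvec; i++ )
    {
        vec[i].guest_masked = (data >> i) & 1;   /* remember the guest's wish */
        if ( vec[i].host_masked )
            data |= 1u << i;                     /* host mask wins */
    }
    return data;                                 /* value to write to hardware */
}

int main(void)
{
    struct vector_state vec[4] = { { 0, 1 }, { 0, 0 }, { 0, 0 }, { 0, 1 } };
    uint32_t hw = intercept_mask_write(vec, 4, 0x4); /* guest masks vector 2 */

    printf("hardware mask register: %#x\n", (unsigned)hw); /* 0xd: 2 plus host-masked 0 and 3 */
    return 0;
}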
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1303,6 +1303,37 @@ int pci_msi_conf_write_intercept(struct
return 1;
}
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
+ if ( entry && entry->msi_attrib.maskbit )
+ {
+ uint16_t cntl;
+ uint32_t unused;
+
+ pos = entry->msi_attrib.pos;
+ if ( reg < pos || reg >= entry->msi.mpos + 8 )
+ return 0;
+
+ if ( reg == msi_control_reg(pos) )
+ return size == 2 ? 1 : -EACCES;
+ if ( reg < entry->msi.mpos || reg >= entry->msi.mpos + 4 || size != 4 )
+ return -EACCES;
+
+ cntl = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
+ unused = ~(uint32_t)0 >> (32 - multi_msi_capable(cntl));
+ for ( pos = 0; pos < entry->msi.nvec; ++pos, ++entry )
+ {
+ entry->msi_attrib.guest_masked =
+ *data >> entry->msi_attrib.entry_nr;
+ if ( entry->msi_attrib.host_masked )
+ *data |= 1 << pos;
+ unused &= ~(1 << pos);
+ }
+
+ *data |= unused;
+
+ return 1;
+ }
+
return 0;
}

View File

@ -1,63 +0,0 @@
# Commit a7bd9b1661304500cd18b7d216d616ecf053ebdb
# Date 2015-08-05 10:32:45 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
x86/gdt: Drop write-only, xalloc()'d array from set_gdt()
It is not used, and can cause a spurious failure of the set_gdt() hypercall in
low memory situations.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4383,20 +4383,15 @@ long set_gdt(struct vcpu *v,
l1_pgentry_t *pl1e;
/* NB. There are 512 8-byte entries per GDT page. */
int i, nr_pages = (entries + 511) / 512;
- unsigned long mfn, *pfns;
if ( entries > FIRST_RESERVED_GDT_ENTRY )
return -EINVAL;
- pfns = xmalloc_array(unsigned long, nr_pages);
- if ( !pfns )
- return -ENOMEM;
-
/* Check the pages in the new GDT. */
for ( i = 0; i < nr_pages; i++ )
{
struct page_info *page;
- pfns[i] = frames[i];
+
page = get_page_from_gfn(d, frames[i], NULL, P2M_ALLOC);
if ( !page )
goto fail;
@@ -4405,7 +4400,7 @@ long set_gdt(struct vcpu *v,
put_page(page);
goto fail;
}
- mfn = frames[i] = page_to_mfn(page);
+ frames[i] = page_to_mfn(page);
}
/* Tear down the old GDT. */
@@ -4420,7 +4415,6 @@ long set_gdt(struct vcpu *v,
l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
}
- xfree(pfns);
return 0;
fail:
@@ -4428,7 +4422,6 @@ long set_gdt(struct vcpu *v,
{
put_page_and_type(mfn_to_page(frames[i]));
}
- xfree(pfns);
return -EINVAL;
}

View File

@ -1,169 +0,0 @@
# Commit 0174da5b79752e2d5d6ca0faed89536e8f3d91c7
# Date 2015-08-06 10:04:43 +0100
# Author Anshul Makkar <anshul.makkar@citrix.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
x86/mm: Make {hap, shadow}_teardown() preemptible
A domain with sufficient shadow allocation can cause a watchdog timeout
during domain destruction. Expand the existing -ERESTART logic in
paging_teardown() to allow {hap/sh}_set_allocation() to become
restartable during the DOMCTL_destroydomain hypercall.
Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: George Dunlap <george.dunlap@eu.citrix.com>
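The preemption pattern itself can be sketched in isolation (illustrative only; BATCH, pages_to_free and teardown() are stand-ins, not the hypervisor's data structures): do a bounded amount of work, report via the preempted pointer, and let the caller return -ERESTART so the hypercall is re-invoked.

#include <stdio.h>

#define BATCH 1000

static unsigned long pages_to_free = 5000;   /* stand-in for shadow/HAP pages */

static void teardown(int *preempted)
{
    unsigned long done = 0;

    while ( pages_to_free )
    {
        pages_to_free--;                     /* free one page (simulated) */
        if ( ++done == BATCH && pages_to_free )
        {
            if ( preempted )                 /* NULL means "must finish now" */
            {
                *preempted = 1;
                return;
            }
            done = 0;                        /* final teardown: keep going */
        }
    }
}

int main(void)
{
    int preempted;

    do {
        preempted = 0;
        teardown(&preempted);                /* caller would return -ERESTART */
        printf("pass done, %lu pages left\n", pages_to_free);
    } while ( preempted );

    return 0;
}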
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -503,7 +503,7 @@ void hap_final_teardown(struct domain *d
}
if ( d->arch.paging.hap.total_pages != 0 )
- hap_teardown(d);
+ hap_teardown(d, NULL);
p2m_teardown(p2m_get_hostp2m(d));
/* Free any memory that the p2m teardown released */
@@ -513,7 +513,7 @@ void hap_final_teardown(struct domain *d
paging_unlock(d);
}
-void hap_teardown(struct domain *d)
+void hap_teardown(struct domain *d, int *preempted)
{
struct vcpu *v;
mfn_t mfn;
@@ -541,18 +541,11 @@ void hap_teardown(struct domain *d)
if ( d->arch.paging.hap.total_pages != 0 )
{
- HAP_PRINTK("teardown of domain %u starts."
- " pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.paging.hap.total_pages,
- d->arch.paging.hap.free_pages,
- d->arch.paging.hap.p2m_pages);
- hap_set_allocation(d, 0, NULL);
- HAP_PRINTK("teardown done."
- " pages total = %u, free = %u, p2m=%u\n",
- d->arch.paging.hap.total_pages,
- d->arch.paging.hap.free_pages,
- d->arch.paging.hap.p2m_pages);
+ hap_set_allocation(d, 0, preempted);
+
+ if ( preempted && *preempted )
+ goto out;
+
ASSERT(d->arch.paging.hap.total_pages == 0);
}
@@ -561,6 +554,7 @@ void hap_teardown(struct domain *d)
xfree(d->arch.hvm_domain.dirty_vram);
d->arch.hvm_domain.dirty_vram = NULL;
+out:
paging_unlock(d);
}
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -779,12 +779,15 @@ long paging_domctl_continuation(XEN_GUES
/* Call when destroying a domain */
int paging_teardown(struct domain *d)
{
- int rc;
+ int rc, preempted = 0;
if ( hap_enabled(d) )
- hap_teardown(d);
+ hap_teardown(d, &preempted);
else
- shadow_teardown(d);
+ shadow_teardown(d, &preempted);
+
+ if ( preempted )
+ return -ERESTART;
/* clean up log dirty resources. */
rc = paging_free_log_dirty_bitmap(d, 0);
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3030,7 +3030,7 @@ int shadow_enable(struct domain *d, u32
return rv;
}
-void shadow_teardown(struct domain *d)
+void shadow_teardown(struct domain *d, int *preempted)
/* Destroy the shadow pagetables of this domain and free its shadow memory.
* Should only be called for dying domains. */
{
@@ -3091,23 +3091,16 @@ void shadow_teardown(struct domain *d)
if ( d->arch.paging.shadow.total_pages != 0 )
{
- SHADOW_PRINTK("teardown of domain %u starts."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
- d->arch.paging.shadow.p2m_pages);
/* Destroy all the shadows and release memory to domheap */
- sh_set_allocation(d, 0, NULL);
+ sh_set_allocation(d, 0, preempted);
+
+ if ( preempted && *preempted )
+ goto out;
+
/* Release the hash table back to xenheap */
if (d->arch.paging.shadow.hash_table)
shadow_hash_teardown(d);
- /* Should not have any more memory held */
- SHADOW_PRINTK("teardown done."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
- d->arch.paging.shadow.p2m_pages);
+
ASSERT(d->arch.paging.shadow.total_pages == 0);
}
@@ -3138,6 +3131,7 @@ void shadow_teardown(struct domain *d)
d->arch.hvm_domain.dirty_vram = NULL;
}
+out:
paging_unlock(d);
/* Must be called outside the lock */
@@ -3159,7 +3153,7 @@ void shadow_final_teardown(struct domain
* It is possible for a domain that never got domain_kill()ed
* to get here with its shadow allocation intact. */
if ( d->arch.paging.shadow.total_pages != 0 )
- shadow_teardown(d);
+ shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
p2m_teardown(p2m_get_hostp2m(d));
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -54,7 +54,7 @@ int hap_domctl(struct domain *d, xen_d
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
int hap_enable(struct domain *d, u32 mode);
void hap_final_teardown(struct domain *d);
-void hap_teardown(struct domain *d);
+void hap_teardown(struct domain *d, int *preempted);
void hap_vcpu_init(struct vcpu *v);
int hap_track_dirty_vram(struct domain *d,
unsigned long begin_pfn,
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -72,7 +72,7 @@ int shadow_domctl(struct domain *d,
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
/* Call when destroying a domain */
-void shadow_teardown(struct domain *d);
+void shadow_teardown(struct domain *d, int *preempted);
/* Call once all of the references to the domain have gone away */
void shadow_final_teardown(struct domain *d);

View File

@ -1,96 +0,0 @@
# Commit 22c5675877c8209adcfdb6bceddb561320374529
# Date 2015-08-25 16:17:13 +0200
# Author Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86, amd_ucode: skip microcode updates for final levels
Some older [Fam10h] systems require that certain applied microcode
patch levels not be overwritten by the microcode loader; otherwise,
system hangs are known to occur.
The 'final_levels' of patch ids have been obtained empirically.
Refer bug https://bugzilla.suse.com/show_bug.cgi?id=913996
for details of the issue.
The short version is that people have predominantly noticed
system hang issues when trying to update microcode levels
beyond the patch IDs below.
[0x01000098, 0x0100009f, 0x010000af]
From internal discussions, we gathered that the OS/hypervisor
cannot reliably perform microcode updates beyond these levels
due to hardware issues. Therefore, we need to abort the microcode
update process if we hit any of these levels.
In this patch, we check for those microcode versions and abort
if the current core has one of those final patch levels applied
by the BIOS.
A Linux version of the patch has already made it into tip:
http://marc.info/?l=linux-kernel&m=143703405627170
Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
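The check itself is small enough to show standalone; this sketch mirrors the logic the patch adds, except that the CPU family and revision are passed in explicitly instead of being read from ucode_cpu_info.

#include <stdio.h>

static const unsigned int final_levels[] = {
    0x01000098,
    0x0100009f,
    0x010000af
};

static int check_final_patch_levels(unsigned int family, unsigned int rev)
{
    unsigned int i;

    if ( family != 0x10 )                    /* only Fam10h is affected */
        return 0;

    for ( i = 0; i < sizeof(final_levels) / sizeof(final_levels[0]); i++ )
        if ( rev == final_levels[i] )
            return 1;                        /* do not update past this level */

    return 0;
}

int main(void)
{
    printf("rev 0x0100009f on Fam10h: %s\n",
           check_final_patch_levels(0x10, 0x0100009f) ? "skip" : "update");
    printf("rev 0x01000090 on Fam10h: %s\n",
           check_final_patch_levels(0x10, 0x01000090) ? "skip" : "update");
    return 0;
}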
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -347,6 +347,43 @@ static int container_fast_forward(const
return 0;
}
+/*
+ * The 'final_levels' of patch ids have been obtained empirically.
+ * Refer bug https://bugzilla.suse.com/show_bug.cgi?id=913996
+ * for details of the issue. The short version is that people
+ * using certain Fam10h systems noticed system hang issues when
+ * trying to update microcode levels beyond the patch IDs below.
+ * From internal discussions, we gathered that OS/hypervisor
+ * cannot reliably perform microcode updates beyond these levels
+ * due to hardware issues. Therefore, we need to abort microcode
+ * update process if we hit any of these levels.
+ */
+static const unsigned int final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af
+};
+
+static bool_t check_final_patch_levels(unsigned int cpu)
+{
+ /*
+ * Check the current patch levels on the cpu. If they are equal to
+ * any of the 'final_levels', then we should not update the microcode
+ * patch on the cpu as system will hang otherwise.
+ */
+ struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
+ unsigned int i;
+
+ if ( boot_cpu_data.x86 != 0x10 )
+ return 0;
+
+ for ( i = 0; i < ARRAY_SIZE(final_levels); i++ )
+ if ( uci->cpu_sig.rev == final_levels[i] )
+ return 1;
+
+ return 0;
+}
+
static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
{
struct microcode_amd *mc_amd, *mc_old;
@@ -369,6 +406,14 @@ static int cpu_request_microcode(int cpu
goto out;
}
+ if ( check_final_patch_levels(cpu) )
+ {
+ printk(XENLOG_INFO
+ "microcode: Cannot update microcode patch on the cpu as we hit a final level\n");
+ error = -EPERM;
+ goto out;
+ }
+
mc_amd = xmalloc(struct microcode_amd);
if ( !mc_amd )
{

View File

@ -1,21 +0,0 @@
# Commit 5f335544cf5b716b0af51223e33373c4a7d65e8c
# Date 2015-08-27 17:40:38 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
IOMMU: skip domains without page tables when dumping
Reported-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Roger Pau Monné <roger.pau@citrix.com>
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -368,7 +368,7 @@ static void iommu_dump_p2m_table(unsigne
ops = iommu_get_ops();
for_each_domain(d)
{
- if ( is_hardware_domain(d) )
+ if ( is_hardware_domain(d) || need_iommu(d) <= 0 )
continue;
if ( iommu_use_hap_pt(d) )

View File

@ -1,95 +0,0 @@
# Commit 8f945d36d9bddd5b589ba23c7322b30d623dd084
# Date 2015-08-31 13:51:52 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/NUMA: fix setup_node()
The function referenced an __initdata object (nodes_found). Since a
node mask is more complicated than needed here, the variable gets
replaced by a simple counter. At the same time, check that the node
count doesn't exceed MAX_NUMNODES.
Also consolidate three printk()s related to the function's use into just
one.
Finally (quite the opposite of the above issue) __init-annotate
nodes_cover_memory().
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
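A standalone sketch of the new allocation scheme (simplified from the patch; MAX_NUMNODES is shrunk here just for the demo, and the range designator in the initializer is a GCC extension, as in the original code):

#include <stdio.h>

#define MAX_NUMNODES 8
#define NUMA_NO_NODE 0xff

static unsigned char pxm2node[256] = { [0 ... 255] = NUMA_NO_NODE };

static int setup_node(unsigned int pxm)
{
    unsigned int node = pxm2node[pxm];

    if ( node == NUMA_NO_NODE )
    {
        static unsigned int nodes_found;

        node = nodes_found++;
        if ( node >= MAX_NUMNODES )
            return -1;                       /* too many proximity domains */
        pxm2node[pxm] = node;
    }
    return node;
}

int main(void)
{
    printf("pxm 3 -> node %d\n", setup_node(3));
    printf("pxm 3 -> node %d (cached)\n", setup_node(3));
    printf("pxm 7 -> node %d\n", setup_node(7));
    return 0;
}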
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -25,7 +25,6 @@ static struct acpi_table_slit *__read_mo
static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
-static nodemask_t nodes_found __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;
static u8 __read_mostly pxm2node[256] = { [0 ... 255] = NUMA_NO_NODE };
@@ -45,17 +44,25 @@ int pxm_to_node(int pxm)
return (signed char)pxm2node[pxm];
}
-__devinit int setup_node(int pxm)
+int setup_node(int pxm)
{
unsigned node = pxm2node[pxm];
- if (node == 0xff) {
- if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+
+ if (node == NUMA_NO_NODE) {
+ static bool_t warned;
+ static unsigned nodes_found;
+
+ node = nodes_found++;
+ if (node >= MAX_NUMNODES) {
+ printk(KERN_WARNING
+ "SRAT: Too many proximity domains (%#x)\n",
+ pxm);
+ warned = 1;
return -1;
- node = first_unset_node(nodes_found);
- node_set(node, nodes_found);
+ }
pxm2node[pxm] = node;
}
- return pxm2node[pxm];
+ return node;
}
int valid_numa_range(u64 start, u64 end, int node)
@@ -176,7 +183,6 @@ acpi_numa_x2apic_affinity_init(struct ac
pxm = pa->proximity_domain;
node = setup_node(pxm);
if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
@@ -209,7 +215,6 @@ acpi_numa_processor_affinity_init(struct
}
node = setup_node(pxm);
if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
@@ -253,7 +258,6 @@ acpi_numa_memory_affinity_init(struct ac
pxm &= 0xff;
node = setup_node(pxm);
if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains.\n");
bad_srat();
return;
}
@@ -295,7 +299,7 @@ acpi_numa_memory_affinity_init(struct ac
/* Sanity check to catch more bad SRATs (they are amazingly common).
Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(void)
{
int i;

View File

@ -1,132 +0,0 @@
# Commit c011f470e6e79208f5baa071b4d072b78c88e2ba
# Date 2015-08-31 13:52:24 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/NUMA: don't account hotplug regions
... except in cases where they really matter: node_memblk_range[] now
is the only place all regions get stored. nodes[] and NODE_DATA() track
present memory only. This improves the reporting when nodes have
disjoint "normal" and hotplug regions, with the hotplug region sitting
above the highest populated page. In such cases a node's spanned-pages
value (visible in both XEN_SYSCTL_numainfo and 'u' debug key output)
covered all the way up to top of populated memory, giving quite
different a picture from what an otherwise identically configured
system without and hotplug regions would report. Note, however, that
the actual hotplug case (as well as cases of nodes with multiple
disjoint present regions) is still not being handled such that the
reported values would represent how much memory a node really has (but
that can be considered intentional).
Reported-by: Jim Fehlig <jfehlig@suse.com>
This at once makes nodes_cover_memory() no longer consider E820_RAM
regions covered by SRAT hotplug regions.
Also reject self-overlaps with mismatching hotplug flags.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Jim Fehlig <jfehlig@suse.com>
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -32,7 +32,7 @@ static u8 __read_mostly pxm2node[256] =
static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static int memblk_nodeid[NR_NODE_MEMBLKS];
-
+static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);
static int node_to_pxm(int n);
@@ -89,9 +89,9 @@ static __init int conflicting_memblks(u6
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
- return memblk_nodeid[i];
+ return i;
if (nd->end == end && nd->start == start)
- return memblk_nodeid[i];
+ return i;
}
return -1;
}
@@ -229,7 +229,6 @@ acpi_numa_processor_affinity_init(struct
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
- struct node *nd;
u64 start, end;
int node, pxm;
int i;
@@ -263,30 +262,40 @@ acpi_numa_memory_affinity_init(struct ac
}
/* It is fine to add this area to the nodes data it will be used later*/
i = conflicting_memblks(start, end);
- if (i == node) {
- printk(KERN_WARNING
- "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
- PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
- } else if (i >= 0) {
+ if (i < 0)
+ /* everything fine */;
+ else if (memblk_nodeid[i] == node) {
+ bool_t mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
+ !test_bit(i, memblk_hotplug);
+
+ printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
+ mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
+ node_memblk_range[i].start, node_memblk_range[i].end);
+ if (mismatch) {
+ bad_srat();
+ return;
+ }
+ } else {
printk(KERN_ERR
- "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
- PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
- nodes[i].start, nodes[i].end);
+ "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
+ pxm, start, end, node_to_pxm(memblk_nodeid[i]),
+ node_memblk_range[i].start, node_memblk_range[i].end);
bad_srat();
return;
}
- nd = &nodes[node];
- if (!node_test_and_set(node, memory_nodes_parsed)) {
- nd->start = start;
- nd->end = end;
- } else {
- if (start < nd->start)
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
+ struct node *nd = &nodes[node];
+
+ if (!node_test_and_set(node, memory_nodes_parsed)) {
nd->start = start;
- if (nd->end < end)
nd->end = end;
+ } else {
+ if (start < nd->start)
+ nd->start = start;
+ if (nd->end < end)
+ nd->end = end;
+ }
}
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug)
- mem_hotplug = end;
printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
node, pxm, start, end,
ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");
@@ -294,6 +303,11 @@ acpi_numa_memory_affinity_init(struct ac
node_memblk_range[num_node_memblks].start = start;
node_memblk_range[num_node_memblks].end = end;
memblk_nodeid[num_node_memblks] = node;
+ if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+ __set_bit(num_node_memblks, memblk_hotplug);
+ if (end > mem_hotplug)
+ mem_hotplug = end;
+ }
num_node_memblks++;
}

View File

@ -1,176 +0,0 @@
# Commit 88e3ed61642bb393458acc7a9bd2f96edc337190
# Date 2015-09-01 14:02:57 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/NUMA: make init_node_heap() respect Xen heap limit
On NUMA systems, where we try to use node local memory for the basic
control structures of the buddy allocator, this special case needs to
take into consideration a possible address width limit placed on the
Xen heap. In turn this (but also other, more abstract considerations)
requires that xenheap_max_mfn() not be called more than once (at most
we might permit it to be called a second time with a larger value than
was passed the first time), and be called only before calling
end_boot_allocator().
While inspecting all the involved code, a couple of off-by-one issues
were found (and are being corrected here at once):
- arch_init_memory() cleared one too many page table slots
- the highmem_start based invocation of xenheap_max_mfn() passed too
big a value
- xenheap_max_mfn() calculated the wrong bit count in edge cases
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
# Commit 0a7167d9b20cdc48e6ea320fbbb920b3267c9757
# Date 2015-09-04 14:58:07 +0100
# Author Julien Grall <julien.grall@citrix.com>
# Committer Ian Campbell <ian.campbell@citrix.com>
xen/arm64: do not (incorrectly) limit size of xenheap
The commit 88e3ed61642bb393458acc7a9bd2f96edc337190 "x86/NUMA: make
init_node_heap() respect Xen heap limit" breaks boot on the arm64 board
X-Gene.
The xenheap_bits variable records the last RAM MFN always mapped
in Xen virtual memory. If the value is 0, it means that all memory is
always mapped in Xen virtual memory.
On X-Gene the RAM bank resides above 128GB and the last xenheap MFN is
0x4400000. With the new way of calculating the number of bits, xenheap_bits
will be 38. This hides all the RAM and makes it impossible to allocate
xenheap memory.
Given that aarch64 always has all memory mapped in Xen virtual
memory, it's not necessary to call xenheap_max_mfn, which sets the number
of bits.
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
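A standalone sketch of the address-width arithmetic introduced on x86 (illustrative only; PAGE_SHIFT and PADDR_BITS are typical x86-64 values and fls() is re-implemented locally): derive the heap's address width from the highest always-mapped MFN, then test whether a candidate MFN range fits below that limit.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PADDR_BITS 52

static unsigned int xenheap_bits;

static int fls(unsigned long x)              /* most significant set bit, 1-based */
{
    int r = 0;

    while ( x )
    {
        x >>= 1;
        r++;
    }
    return r;
}

static void xenheap_max_mfn(unsigned long mfn)
{
    unsigned int bits = fls(mfn + 1) - 1 + PAGE_SHIFT;

    xenheap_bits = bits < PADDR_BITS ? bits : PADDR_BITS;
}

/* Would [mfn, mfn + nr) stay within the heap's address width? */
static int range_fits(unsigned long mfn, unsigned long nr)
{
    return !xenheap_bits || !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT));
}

int main(void)
{
    xenheap_max_mfn((1UL << 25) - 1);        /* heap limited to 128GiB: 2^25 pages */
    printf("xenheap_bits = %u\n", xenheap_bits);
    printf("range at 64GiB fits:  %d\n", range_fits(1UL << 24, 256));
    printf("range at 256GiB fits: %d\n", range_fits(1UL << 26, 256));
    return 0;
}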
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -664,7 +664,6 @@ static void __init setup_mm(unsigned lon
xenheap_virt_end = XENHEAP_VIRT_START + ram_end - ram_start;
xenheap_mfn_start = ram_start >> PAGE_SHIFT;
xenheap_mfn_end = ram_end >> PAGE_SHIFT;
- xenheap_max_mfn(xenheap_mfn_end);
/*
* Need enough mapped pages for copying the DTB.
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -372,7 +372,7 @@ void __init arch_init_memory(void)
for ( i = 0; i < l3_table_offset(split_va); ++i )
l3tab[i] = l3idle[i];
- for ( ; i <= L3_PAGETABLE_ENTRIES; ++i )
+ for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
l3tab[i] = l3e_empty();
split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
__PAGE_HYPERVISOR);
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -970,7 +970,7 @@ void __init noreturn __start_xen(unsigne
setup_max_pdx(raw_max_page);
if ( highmem_start )
- xenheap_max_mfn(PFN_DOWN(highmem_start));
+ xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
/*
* Walk every RAM region and map it in its entirety (on x86/64, at least)
@@ -1151,9 +1151,6 @@ void __init noreturn __start_xen(unsigne
numa_initmem_init(0, raw_max_page);
- end_boot_allocator();
- system_state = SYS_STATE_boot;
-
if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
{
unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
@@ -1162,6 +1159,8 @@ void __init noreturn __start_xen(unsigne
if ( !highmem_start )
xenheap_max_mfn(limit);
+ end_boot_allocator();
+
/* Pass the remaining memory to the allocator. */
for ( i = 0; i < boot_e820.nr_map; i++ )
{
@@ -1185,6 +1184,10 @@ void __init noreturn __start_xen(unsigne
opt_tmem = 0;
}
}
+ else
+ end_boot_allocator();
+
+ system_state = SYS_STATE_boot;
vm_init();
console_init_ring();
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -405,13 +405,19 @@ void get_outstanding_claims(uint64_t *fr
spin_unlock(&heap_lock);
}
+static bool_t __read_mostly first_node_initialised;
+#ifndef CONFIG_SEPARATE_XENHEAP
+static unsigned int __read_mostly xenheap_bits;
+#else
+#define xenheap_bits 0
+#endif
+
static unsigned long init_node_heap(int node, unsigned long mfn,
unsigned long nr, bool_t *use_tail)
{
/* First node to be discovered has its heap metadata statically alloced. */
static heap_by_zone_and_order_t _heap_static;
static unsigned long avail_static[NR_ZONES];
- static int first_node_initialised;
unsigned long needed = (sizeof(**_heap) +
sizeof(**avail) * NR_ZONES +
PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -429,14 +435,18 @@ static unsigned long init_node_heap(int
}
#ifdef DIRECTMAP_VIRT_END
else if ( *use_tail && nr >= needed &&
- (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
+ (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) &&
+ (!xenheap_bits ||
+ !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
_heap[node] = mfn_to_virt(mfn + nr - needed);
avail[node] = mfn_to_virt(mfn + nr - 1) +
PAGE_SIZE - sizeof(**avail) * NR_ZONES;
}
else if ( nr >= needed &&
- (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
+ (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) &&
+ (!xenheap_bits ||
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
_heap[node] = mfn_to_virt(mfn);
avail[node] = mfn_to_virt(mfn + needed - 1) +
@@ -1541,11 +1551,13 @@ void free_xenheap_pages(void *v, unsigne
#else
-static unsigned int __read_mostly xenheap_bits;
-
void __init xenheap_max_mfn(unsigned long mfn)
{
- xenheap_bits = fls(mfn) + PAGE_SHIFT;
+ ASSERT(!first_node_initialised);
+ ASSERT(!xenheap_bits);
+ BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
+ xenheap_bits = min(fls(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
+ printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
}
void init_xenheap_pages(paddr_t ps, paddr_t pe)

View File

@ -1,68 +0,0 @@
# Commit 244582a01dcb49fa30083725964a066937cc94f2
# Date 2015-09-11 16:24:56 +0200
# Author Kouya Shimura <kouya@jp.fujitsu.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/hvm: fix saved pmtimer and hpet values
The ACPI PM timer is sometimes broken on live migration.
This is because vcpu->arch.hvm_vcpu.guest_time is always zero in modes
other than "delay for missed ticks mode". Even in "delay for missed
ticks mode", a vcpu's guest_time field is not valid (i.e. zero) while
the vcpu is "blocked" (see the pt_save_timer function).
The original author (Tim Deegan) of pmtimer_save() must have intended
for it to save the last scheduled time of the vcpu. Unfortunately, the
bug was already present then. (At the time, "delay for missed ticks
mode" was the only timer mode.)
For consistency with HPET, pmtimer_save() should use hvm_get_guest_time()
to update the counter, just as hpet_save() does.
Without this patch, the clock of windows server 2012R2 without HPET
might leap forward several minutes on live migration.
Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
Retain use of ->arch.hvm_vcpu.guest_time when non-zero. Do the inverse
adjustment for vHPET.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Kouya Shimura <kouya@jp.fujitsu.com>
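The counter update itself boils down to a fixed-point scale of the elapsed guest time; a sketch with made-up numbers (not the Xen code) looks like this:

#include <stdint.h>
#include <stdio.h>

/* ACPI PM timer runs at 3.579545 MHz; guest time here is in nanoseconds. */
#define PMT_FREQ   3579545ULL
#define NS_PER_SEC 1000000000ULL

int main(void)
{
    uint64_t scale = (PMT_FREQ << 32) / NS_PER_SEC;  /* ticks per ns, <<32 */
    uint32_t tmr_val = 0;                            /* saved counter */
    uint64_t last_gtime = 0;                         /* ns, last sync point */
    uint64_t now = 1000000000;                       /* ns, current guest time */

    uint64_t x = ((now - last_gtime) * scale) >> 32; /* elapsed PM timer ticks */
    if ( x < (1ULL << 31) )                          /* only move forwards */
        tmr_val += (uint32_t)x;

    printf("counter advanced by %llu ticks (about one second at %llu Hz)\n",
           (unsigned long long)x, (unsigned long long)PMT_FREQ);
    return 0;
}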
--- a/xen/arch/x86/hvm/hpet.c
+++ b/xen/arch/x86/hvm/hpet.c
@@ -506,11 +506,13 @@ const struct hvm_mmio_handler hpet_mmio_
static int hpet_save(struct domain *d, hvm_domain_context_t *h)
{
HPETState *hp = domain_vhpet(d);
+ struct vcpu *v = pt_global_vcpu_target(d);
int rc;
uint64_t guest_time;
write_lock(&hp->lock);
- guest_time = guest_time_hpet(hp);
+ guest_time = (v->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(v)) /
+ STIME_PER_HPET_TICK;
/* Write the proper value into the main counter */
if ( hpet_enabled(hp) )
--- a/xen/arch/x86/hvm/pmtimer.c
+++ b/xen/arch/x86/hvm/pmtimer.c
@@ -250,10 +250,12 @@ static int pmtimer_save(struct domain *d
spin_lock(&s->lock);
- /* Update the counter to the guest's current time. We always save
- * with the domain paused, so the saved time should be after the
- * last_gtime, but just in case, make sure we only go forwards */
- x = ((s->vcpu->arch.hvm_vcpu.guest_time - s->last_gtime) * s->scale) >> 32;
+ /*
+ * Update the counter to the guest's current time. Make sure it only
+ * goes forwards.
+ */
+ x = (((s->vcpu->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(s->vcpu)) -
+ s->last_gtime) * s->scale) >> 32;
if ( x < 1UL<<31 )
s->pm.tmr_val += x;
if ( (s->pm.tmr_val & TMR_VAL_MSB) != msb )

View File

@ -1,23 +0,0 @@
# Commit c7d5d5d8ea1ecbd6ef8b47dace4dec825f0f6e48
# Date 2015-09-16 11:20:27 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/MSI: fail if no hardware support
This is to guard against buggy callers (luckily Dom0 only) invoking
the respective hypercall for a device not being MSI-capable.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -696,6 +696,8 @@ static int msi_capability_init(struct pc
ASSERT(spin_is_locked(&pcidevs_lock));
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
+ if ( !pos )
+ return -ENODEV;
control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
maxvec = multi_msi_capable(control);
if ( nvec > maxvec )

View File

@ -34,9 +34,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Tested-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -505,12 +505,12 @@ void update_cr3(struct vcpu *v)
Index: xen-4.6.0-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.6.0-testing.orig/xen/arch/x86/mm.c
+++ xen-4.6.0-testing/xen/arch/x86/mm.c
@@ -502,12 +502,12 @@ void update_cr3(struct vcpu *v)
make_cr3(v, cr3_mfn);
}
@ -51,7 +53,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
struct page_info *page;
BUG_ON(unlikely(in_irq()));
@@ -525,10 +525,10 @@ static void invalidate_shadow_ldt(struct
@@ -522,10 +522,10 @@ static void invalidate_shadow_ldt(struct
for ( i = 16; i < 32; i++ )
{
@ -65,7 +67,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
ASSERT_PAGE_IS_DOMAIN(page, v->domain);
put_page_and_type(page);
@@ -4360,16 +4360,18 @@ long do_update_va_mapping_otherdomain(un
@@ -4420,16 +4420,18 @@ long do_update_va_mapping_otherdomain(un
void destroy_gdt(struct vcpu *v)
{
l1_pgentry_t *pl1e;
@ -88,7 +90,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
v->arch.pv_vcpu.gdt_frames[i] = 0;
}
}
@@ -4382,7 +4384,7 @@ long set_gdt(struct vcpu *v,
@@ -4442,7 +4444,7 @@ long set_gdt(struct vcpu *v,
struct domain *d = v->domain;
l1_pgentry_t *pl1e;
/* NB. There are 512 8-byte entries per GDT page. */

View File

@ -1,77 +0,0 @@
# Commit 86f3ff9fc4cc3cb69b96c1de74bcc51f738fe2b9
# Date 2015-09-25 09:08:22 +0200
# Author Quan Xu <quan.xu@intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
vt-d: fix IM bit mask and unmask of Fault Event Control Register
Bits 0:29 in the Fault Event Control Register are 'Reserved and
Preserved'; software cannot write 0 to them unconditionally and must
preserve the value read for writes.
Signed-off-by: Quan Xu <quan.xu@intel.com>
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
# Commit 26b300bd727ef00a8f60329212a83c3b027a48f7
# Date 2015-09-25 18:03:04 +0200
# Author Quan Xu <quan.xu@intel.com>
# Committer Jan Beulich <jbeulich@suse.com>
vt-d: fix IM bit unmask of Fault Event Control Register in init_vtd_hw()
Bits 0:29 in the Fault Event Control Register are 'Reserved and
Preserved'; software cannot write 0 to them unconditionally and must
preserve the value read for writes.
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Quan Xu <quan.xu@intel.com>
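The fix is a plain read-modify-write of the control register; a generic sketch (with the register modelled as a variable rather than dmar_readl()/dmar_writel()) looks like this: only the IM bit is flipped, everything else is preserved.

#include <stdint.h>
#include <stdio.h>

#define DMA_FECTL_IM (1u << 31)              /* Interrupt Mask bit */

static uint32_t fectl = 0x12345678;          /* pretend reserved/preserved bits */

static void dma_msi_mask(void)
{
    uint32_t sts = fectl;                    /* read ... */
    sts |= DMA_FECTL_IM;                     /* ... modify only the IM bit ... */
    fectl = sts;                             /* ... write back */
}

static void dma_msi_unmask(void)
{
    uint32_t sts = fectl;
    sts &= ~DMA_FECTL_IM;
    fectl = sts;
}

int main(void)
{
    dma_msi_mask();
    printf("masked:   %#x\n", (unsigned)fectl);  /* preserved bits intact */
    dma_msi_unmask();
    printf("unmasked: %#x\n", (unsigned)fectl);
    return 0;
}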
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -991,10 +991,13 @@ static void dma_msi_unmask(struct irq_de
{
struct iommu *iommu = desc->action->dev_id;
unsigned long flags;
+ u32 sts;
/* unmask it */
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts &= ~DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
iommu->msi.msi_attrib.host_masked = 0;
}
@@ -1003,10 +1006,13 @@ static void dma_msi_mask(struct irq_desc
{
unsigned long flags;
struct iommu *iommu = desc->action->dev_id;
+ u32 sts;
/* mask it */
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts |= DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
iommu->msi.msi_attrib.host_masked = 1;
}
@@ -2002,6 +2008,7 @@ static int init_vtd_hw(void)
struct iommu_flush *flush = NULL;
int ret;
unsigned long flags;
+ u32 sts;
/*
* Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
@@ -2015,7 +2022,9 @@ static int init_vtd_hw(void)
clear_fault_bits(iommu);
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts &= ~DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
}

View File

@ -1,48 +0,0 @@
# Commit 6c0e4ad60850032c9bbd5d18b8446421c97e08e4
# Date 2015-09-29 10:25:29 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/EPT: tighten conditions of IOMMU mapping updates
Permission changes should also result in updates or TLB flushes.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -619,6 +619,7 @@ ept_set_entry(struct p2m_domain *p2m, un
uint8_t ipat = 0;
int need_modify_vtd_table = 1;
int vtd_pte_present = 0;
+ unsigned int iommu_flags = p2m_get_iommu_flags(p2mt);
enum { sync_off, sync_on, sync_check } needs_sync = sync_check;
ept_entry_t old_entry = { .epte = 0 };
ept_entry_t new_entry = { .epte = 0 };
@@ -749,8 +750,9 @@ ept_set_entry(struct p2m_domain *p2m, un
new_entry.mfn = mfn_x(mfn);
/* Safe to read-then-write because we hold the p2m lock */
- if ( ept_entry->mfn == new_entry.mfn )
- need_modify_vtd_table = 0;
+ if ( ept_entry->mfn == new_entry.mfn &&
+ p2m_get_iommu_flags(ept_entry->sa_p2mt) == iommu_flags )
+ need_modify_vtd_table = 0;
ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
}
@@ -775,11 +777,9 @@ out:
iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
else
{
- unsigned int flags = p2m_get_iommu_flags(p2mt);
-
- if ( flags != 0 )
+ if ( iommu_flags )
for ( i = 0; i < (1 << order); i++ )
- iommu_map_page(d, gfn + i, mfn_x(mfn) + i, flags);
+ iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
else
for ( i = 0; i < (1 << order); i++ )
iommu_unmap_page(d, gfn + i);

View File

@ -1,97 +0,0 @@
# Commit 960265fbd878cdc9841473b755e4ccc9eb1942d2
# Date 2015-09-29 13:55:34 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/p2m-pt: delay freeing of intermediate page tables
Old intermediate page tables must be freed only after IOMMU side
updates/flushes have got carried out.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
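The ordering constraint can be shown with a toy sketch (all names invented; the real code stashes the old entry in intermediate_entry and frees it only after the IOMMU update): freeing the old intermediate table before the IOMMU flush would leave a window in which the IOMMU could still walk freed memory.

#include <stdio.h>
#include <stdlib.h>

struct table { int dummy; };

static struct table *old_intermediate;       /* deferred, not freed yet */

static void install_superpage(struct table **slot)
{
    old_intermediate = *slot;                /* stash the old intermediate */
    *slot = NULL;                            /* pretend: write superpage entry */
    puts("superpage entry written");
}

static void iommu_update_and_flush(void)
{
    puts("IOMMU tables updated and flushed");
}

int main(void)
{
    struct table *slot = malloc(sizeof(*slot));

    install_superpage(&slot);
    iommu_update_and_flush();

    free(old_intermediate);                  /* only now is it safe to free */
    puts("old intermediate table freed");
    return 0;
}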
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -486,8 +486,9 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
/* XXX -- this might be able to be faster iff current->domain == d */
void *table;
unsigned long i, gfn_remainder = gfn;
- l1_pgentry_t *p2m_entry;
- l1_pgentry_t entry_content;
+ l1_pgentry_t *p2m_entry, entry_content;
+ /* Intermediate table to free if we're replacing it with a superpage. */
+ l1_pgentry_t intermediate_entry = l1e_empty();
l2_pgentry_t l2e_content;
l3_pgentry_t l3e_content;
int rc;
@@ -535,7 +536,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
*/
if ( page_order == PAGE_ORDER_1G )
{
- l1_pgentry_t old_entry = l1e_empty();
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
L3_PAGETABLE_ENTRIES);
@@ -545,7 +545,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
{
/* We're replacing a non-SP page with a superpage. Make sure to
* handle freeing the table properly. */
- old_entry = *p2m_entry;
+ intermediate_entry = *p2m_entry;
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -563,10 +563,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
}
else
{
@@ -607,7 +603,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
}
else if ( page_order == PAGE_ORDER_2M )
{
- l1_pgentry_t old_entry = l1e_empty();
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
L2_PAGETABLE_ENTRIES);
@@ -619,7 +614,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
{
/* We're replacing a non-SP page with a superpage. Make sure to
* handle freeing the table properly. */
- old_entry = *p2m_entry;
+ intermediate_entry = *p2m_entry;
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -640,10 +635,6 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
}
/* Track the highest gfn for which we have ever had a valid mapping */
@@ -671,6 +662,14 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
}
}
+ /*
+ * Free old intermediate tables if necessary. This has to be the
+ * last thing we do, after removal from the IOMMU tables, so as to
+ * avoid a potential use-after-free.
+ */
+ if ( l1e_get_flags(intermediate_entry) & _PAGE_PRESENT )
+ p2m_free_entry(p2m, &intermediate_entry, page_order);
+
out:
unmap_domain_page(table);
return rc;

View File

@ -1,22 +0,0 @@
# Commit c0a85795d864dd64c116af661bf676d66ddfd5fc
# Date 2015-09-29 13:56:03 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/p2m-pt: ignore pt-share flag for shadow mode guests
There is no page table sharing in shadow mode.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -644,7 +644,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
if ( iommu_enabled && need_iommu(p2m->domain) )
{
- if ( iommu_hap_pt_share )
+ if ( iommu_use_hap_pt(p2m->domain) )
{
if ( old_mfn && (old_mfn != mfn_x(mfn)) )
amd_iommu_flush_pages(p2m->domain, gfn, page_order);

View File

@ -1,104 +0,0 @@
# Commit ea5637968a09a81a64fa5fd73ce49b4ea9789e12
# Date 2015-09-30 14:44:22 +0200
# Author Dario Faggioli <dario.faggioli@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
credit1: fix tickling when it happens from a remote pCPU
especially if that is also from a different cpupool than the
processor of the vCPU that triggered the tickling.
In fact, it is possible that we get as far as calling vcpu_unblock()-->
vcpu_wake()-->csched_vcpu_wake()-->__runq_tickle() for the vCPU 'vc',
but all while running on a pCPU that is different from 'vc->processor'.
For instance, this can happen when an HVM domain runs in a cpupool,
with a different scheduler than the default one, and issues IOREQs
to Dom0, running in Pool-0 with the default scheduler.
In fact, right in this case, the following crash can be observed:
(XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
(XEN) CPU: 7
(XEN) RIP: e008:[<ffff82d0801230de>] __runq_tickle+0x18f/0x430
(XEN) RFLAGS: 0000000000010086 CONTEXT: hypervisor (d1v0)
(XEN) rax: 0000000000000001 rbx: ffff8303184fee00 rcx: 0000000000000000
(XEN) ... ... ...
(XEN) Xen stack trace from rsp=ffff83031fa57a08:
(XEN) ffff82d0801fe664 ffff82d08033c820 0000000100000002 0000000a00000001
(XEN) 0000000000006831 0000000000000000 0000000000000000 0000000000000000
(XEN) ... ... ...
(XEN) Xen call trace:
(XEN) [<ffff82d0801230de>] __runq_tickle+0x18f/0x430
(XEN) [<ffff82d08012348a>] csched_vcpu_wake+0x10b/0x110
(XEN) [<ffff82d08012b421>] vcpu_wake+0x20a/0x3ce
(XEN) [<ffff82d08012b91c>] vcpu_unblock+0x4b/0x4e
(XEN) [<ffff82d080167bd0>] vcpu_kick+0x17/0x61
(XEN) [<ffff82d080167c46>] vcpu_mark_events_pending+0x2c/0x2f
(XEN) [<ffff82d08010ac35>] evtchn_fifo_set_pending+0x381/0x3f6
(XEN) [<ffff82d08010a0f6>] notify_via_xen_event_channel+0xc9/0xd6
(XEN) [<ffff82d0801c29ed>] hvm_send_ioreq+0x3e9/0x441
(XEN) [<ffff82d0801bba7d>] hvmemul_do_io+0x23f/0x2d2
(XEN) [<ffff82d0801bbb43>] hvmemul_do_io_buffer+0x33/0x64
(XEN) [<ffff82d0801bc92b>] hvmemul_do_pio_buffer+0x35/0x37
(XEN) [<ffff82d0801cc49f>] handle_pio+0x58/0x14c
(XEN) [<ffff82d0801eabcb>] vmx_vmexit_handler+0x16b3/0x1bea
(XEN) [<ffff82d0801efd21>] vmx_asm_vmexit_handler+0x41/0xc0
In this case, pCPU 7 is not in Pool-0, while the (Dom0's) vCPU being
woken is. pCPU's 7 pool has a different scheduler than credit, but it
is, however, right from pCPU 7 that we are waking the Dom0's vCPUs.
Therefore, the current code tries to access csched_balance_mask for
pCPU 7, but that is not defined, and hence the Oops.
(Note that, if the two pools run the same scheduler, we see no
Oops, but things are still conceptually wrong.)
Cure things by making the csched_balance_mask macro accept a
parameter for fetching a specific pCPU's mask (instead of always
using smp_processor_id()).
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
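The shape of the fix, reduced to a standalone sketch (the cpumask_t type, the NR_CPUS value and the smp_processor_id() stub are illustrative): fetch the scratch mask of an explicitly named pCPU rather than implicitly the current one, so code running on pCPU A can safely work with pCPU B's per-CPU data.

#include <stdio.h>

#define NR_CPUS 8

typedef unsigned long cpumask_t;

struct csched_pcpu {
    cpumask_t balance_mask;
};

static struct csched_pcpu pcpu[NR_CPUS];

static unsigned int smp_processor_id(void) { return 7; } /* pretend: on pCPU 7 */

/* Old form: implicitly the current pCPU's mask; wrong when tickling remotely. */
#define csched_balance_mask_old    (&pcpu[smp_processor_id()].balance_mask)
/* New form: the caller names the pCPU whose mask it wants. */
#define csched_balance_mask(c)     (&pcpu[(c)].balance_mask)

int main(void)
{
    unsigned int target_cpu = 2;              /* the woken vCPU's processor */

    *csched_balance_mask(target_cpu) = 0x0f;  /* touch pCPU 2's mask, not ours */
    printf("running on pCPU %u: pcpu2 mask %#lx, pcpu7 mask %#lx\n",
           smp_processor_id(), pcpu[2].balance_mask, pcpu[7].balance_mask);
    return 0;
}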
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -154,10 +154,10 @@ struct csched_pcpu {
* Convenience macro for accessing the per-PCPU cpumask we need for
* implementing the two steps (soft and hard affinity) balancing logic.
* It is stored in csched_pcpu so that serialization is not an issue,
- * as there is a csched_pcpu for each PCPU and we always hold the
- * runqueue spin-lock when using this.
+ * as there is a csched_pcpu for each PCPU, and we always hold the
+ * runqueue lock for the proper PCPU when using this.
*/
-#define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask)
+#define csched_balance_mask(c) (CSCHED_PCPU(c)->balance_mask)
/*
* Virtual CPU
@@ -396,9 +396,10 @@ __runq_tickle(unsigned int cpu, struct c
/* Are there idlers suitable for new (for this balance step)? */
csched_balance_cpumask(new->vcpu, balance_step,
- csched_balance_mask);
- cpumask_and(csched_balance_mask, csched_balance_mask, &idle_mask);
- new_idlers_empty = cpumask_empty(csched_balance_mask);
+ csched_balance_mask(cpu));
+ cpumask_and(csched_balance_mask(cpu),
+ csched_balance_mask(cpu), &idle_mask);
+ new_idlers_empty = cpumask_empty(csched_balance_mask(cpu));
/*
* Let's not be too harsh! If there aren't idlers suitable
@@ -1475,8 +1476,9 @@ csched_runq_steal(int peer_cpu, int cpu,
&& !__vcpu_has_soft_affinity(vc, vc->cpu_hard_affinity) )
continue;
- csched_balance_cpumask(vc, balance_step, csched_balance_mask);
- if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) )
+ csched_balance_cpumask(vc, balance_step, csched_balance_mask(cpu));
+ if ( __csched_vcpu_is_migrateable(vc, cpu,
+ csched_balance_mask(cpu)) )
{
/* We got a candidate. Grab it! */
TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu,

View File

@ -1,159 +0,0 @@
# Commit 660fd65d5578a95ec5eac522128bba23325179eb
# Date 2015-10-02 13:40:36 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/p2m-pt: tighten conditions of IOMMU mapping updates
Whether the MFN changes does not depend on the new entry being valid
(but solely on the old one), and the need to update or TLB-flush also
depends on permission changes.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
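Reduced to its core, the decision the patch implements is: the IOMMU needs an update or flush whenever the old MFN or the old permission flags differ from the new ones, regardless of whether the new entry is valid. A toy sketch (flag values and the helper name are invented):

#include <stdio.h>

#define IOMMU_READ  1u
#define IOMMU_WRITE 2u

static int needs_iommu_update(unsigned long old_mfn, unsigned int old_flags,
                              unsigned long new_mfn, unsigned int new_flags)
{
    return old_mfn != new_mfn || old_flags != new_flags;
}

int main(void)
{
    /* Same MFN, read-only -> read-write: still needs an update/flush. */
    printf("permission change: %d\n",
           needs_iommu_update(0x1234, IOMMU_READ,
                              0x1234, IOMMU_READ | IOMMU_WRITE));
    /* Identical mapping: nothing to do. */
    printf("no change:         %d\n",
           needs_iommu_update(0x1234, IOMMU_READ, 0x1234, IOMMU_READ));
    return 0;
}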
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -493,7 +493,18 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
l3_pgentry_t l3e_content;
int rc;
unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt);
- unsigned long old_mfn = 0;
+ /*
+ * old_mfn and iommu_old_flags control possible flush/update needs on the
+ * IOMMU: We need to flush when MFN or flags (i.e. permissions) change.
+ * iommu_old_flags being initialized to zero covers the case of the entry
+ * getting replaced being a non-present (leaf or intermediate) one. For
+ * present leaf entries the real value will get calculated below, while
+ * for present intermediate entries ~0 (guaranteed != iommu_pte_flags)
+ * will be used (to cover all cases of what the leaf entries underneath
+ * the intermediate one might be).
+ */
+ unsigned int flags, iommu_old_flags = 0;
+ unsigned long old_mfn = INVALID_MFN;
if ( tb_init_done )
{
@@ -540,12 +551,20 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
L3_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ flags = l1e_get_flags(*p2m_entry);
+ if ( flags & _PAGE_PRESENT )
{
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- intermediate_entry = *p2m_entry;
+ if ( flags & _PAGE_PSE )
+ {
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+ else
+ {
+ iommu_old_flags = ~0;
+ intermediate_entry = *p2m_entry;
+ }
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -556,10 +575,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
entry_content.l1 = l3e_content.l3;
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
@@ -584,7 +600,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
0, L1_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
-
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(l1e_get_flags(*p2m_entry)));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+
if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct)
|| p2m_is_paging(p2mt) )
entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
@@ -593,10 +612,8 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
entry_content = l1e_empty();
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
+
/* level 1 entry */
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 1);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
@@ -607,14 +624,20 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
L2_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
-
- /* FIXME: Deal with 4k replaced by 2meg pages */
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- {
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- intermediate_entry = *p2m_entry;
+ flags = l1e_get_flags(*p2m_entry);
+ if ( flags & _PAGE_PRESENT )
+ {
+ if ( flags & _PAGE_PSE )
+ {
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+ else
+ {
+ iommu_old_flags = ~0;
+ intermediate_entry = *p2m_entry;
+ }
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -628,10 +651,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
entry_content.l1 = l2e_content.l2;
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
@@ -642,17 +662,17 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
&& (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
- if ( iommu_enabled && need_iommu(p2m->domain) )
+ if ( iommu_enabled && need_iommu(p2m->domain) &&
+ (iommu_old_flags != iommu_pte_flags || old_mfn != mfn_x(mfn)) )
{
if ( iommu_use_hap_pt(p2m->domain) )
{
- if ( old_mfn && (old_mfn != mfn_x(mfn)) )
+ if ( iommu_old_flags )
amd_iommu_flush_pages(p2m->domain, gfn, page_order);
}
else
{
- unsigned int flags = p2m_get_iommu_flags(p2mt);
-
+ flags = p2m_get_iommu_flags(p2mt);
if ( flags != 0 )
for ( i = 0; i < (1UL << page_order); i++ )
iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i, flags);
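The core condition established above, namely that the IOMMU only needs an update or flush when either the mapped MFN or the effective permissions change, can be sketched in isolation as follows; the entry layout and helper names are made up for the example and do not mirror Xen's p2m types.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for a p2m leaf entry: a frame number plus the
     * flags that would be programmed into the IOMMU (0 == not mapped). */
    struct pte {
        unsigned long mfn;
        unsigned int  iommu_flags;
    };

    /* An update (or flush) is needed iff the frame or the permissions differ. */
    static bool iommu_needs_update(struct pte old, struct pte new)
    {
        return old.mfn != new.mfn || old.iommu_flags != new.iommu_flags;
    }

    int main(void)
    {
        struct pte old  = { .mfn = 0x1000, .iommu_flags = 0x3 };
        struct pte same = old;
        struct pte perm = { .mfn = 0x1000, .iommu_flags = 0x1 }; /* permissions shrink */
        struct pte move = { .mfn = 0x2000, .iommu_flags = 0x3 }; /* frame changes */

        printf("%d %d %d\n",
               iommu_needs_update(old, same),   /* 0: nothing changed, skip */
               iommu_needs_update(old, perm),   /* 1: flush for permission change */
               iommu_needs_update(old, move));  /* 1: remap for the new frame */
        return 0;
    }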

View File

@ -17,9 +17,11 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Yang Zhang <yang.z.zhang@intel.com>
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -946,8 +946,18 @@ void __init x2apic_bsp_setup(void)
Index: xen-4.6.0-testing/xen/arch/x86/apic.c
===================================================================
--- xen-4.6.0-testing.orig/xen/arch/x86/apic.c
+++ xen-4.6.0-testing/xen/arch/x86/apic.c
@@ -943,8 +943,18 @@ void __init x2apic_bsp_setup(void)
mask_8259A();
mask_IO_APIC_setup(ioapic_entries);
@ -39,9 +41,11 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
if ( x2apic_enabled )
panic("Interrupt remapping could not be enabled while "
"x2APIC is already enabled by BIOS");
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -144,10 +144,10 @@ static void set_hpet_source_id(unsigned
Index: xen-4.6.0-testing/xen/drivers/passthrough/vtd/intremap.c
===================================================================
--- xen-4.6.0-testing.orig/xen/drivers/passthrough/vtd/intremap.c
+++ xen-4.6.0-testing/xen/drivers/passthrough/vtd/intremap.c
@@ -143,10 +143,10 @@ static void set_hpet_source_id(unsigned
set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, hpetid_to_bdf(id));
}
@ -54,7 +58,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
if ( !iommu_qinval || !iommu_intremap || list_empty(&acpi_drhd_units) )
return 0;
@@ -155,12 +155,12 @@ int iommu_supports_eim(void)
@@ -154,12 +154,12 @@ int iommu_supports_eim(void)
/* We MUST have a DRHD unit for each IOAPIC. */
for ( apic = 0; apic < nr_ioapics; apic++ )
if ( !ioapic_to_drhd(IO_APIC_ID(apic)) )
@ -69,7 +73,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
for_each_drhd_unit ( drhd )
if ( !ecap_queued_inval(drhd->iommu->ecap) ||
@@ -834,10 +834,10 @@ int iommu_enable_x2apic_IR(void)
@@ -833,10 +833,10 @@ int iommu_enable_x2apic_IR(void)
struct iommu *iommu;
if ( !iommu_supports_eim() )
@ -82,7 +86,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
for_each_drhd_unit ( drhd )
{
@@ -862,7 +862,7 @@ int iommu_enable_x2apic_IR(void)
@@ -861,7 +861,7 @@ int iommu_enable_x2apic_IR(void)
{
dprintk(XENLOG_INFO VTDPREFIX,
"Failed to enable Queued Invalidation!\n");
@ -91,7 +95,7 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
}
}
@@ -874,7 +874,7 @@ int iommu_enable_x2apic_IR(void)
@@ -873,7 +873,7 @@ int iommu_enable_x2apic_IR(void)
{
dprintk(XENLOG_INFO VTDPREFIX,
"Failed to enable Interrupt Remapping!\n");
@ -100,9 +104,11 @@ Acked-by: Yang Zhang <yang.z.zhang@intel.com>
}
}
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -28,7 +28,7 @@ int iommu_setup_hpet_msi(struct msi_desc
Index: xen-4.6.0-testing/xen/include/asm-x86/iommu.h
===================================================================
--- xen-4.6.0-testing.orig/xen/include/asm-x86/iommu.h
+++ xen-4.6.0-testing/xen/include/asm-x86/iommu.h
@@ -27,7 +27,7 @@ int iommu_setup_hpet_msi(struct msi_desc
/* While VT-d specific, this must get declared in a generic header. */
int adjust_vtd_irq_affinities(void);
void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int order, int present);

View File

@ -12,17 +12,17 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -904,6 +904,7 @@ void pv_cpuid(struct cpu_user_regs *regs
@@ -967,6 +967,7 @@ void pv_cpuid(struct cpu_user_regs *regs
__clear_bit(X86_FEATURE_LWP % 32, &c);
__clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
__clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
+ __clear_bit(X86_FEATURE_MWAITX % 32, &c);
break;
case 0x00000005: /* MONITOR/MWAIT */
case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -137,6 +137,7 @@
@@ -135,6 +135,7 @@
#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
#define X86_FEATURE_DBEXT (6*32+26) /* data breakpoint extension */

View File

@ -16,7 +16,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -347,7 +347,7 @@ void __init init_cpu_to_node(void)
@@ -349,7 +349,7 @@ void __init init_cpu_to_node(void)
u32 apicid = x86_cpu_to_apicid[i];
if ( apicid == BAD_APICID )
continue;
@ -27,8 +27,8 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
numa_set_node(i, node);
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -191,7 +191,7 @@ void __devinit srat_detect_node(int cpu)
unsigned node;
@@ -200,7 +200,7 @@ void __devinit srat_detect_node(int cpu)
nodeid_t node;
u32 apicid = x86_cpu_to_apicid[cpu];
- node = apicid_to_node[apicid];
@ -38,7 +38,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -885,7 +885,8 @@ int cpu_add(uint32_t apic_id, uint32_t a
@@ -993,7 +993,8 @@ int cpu_add(uint32_t apic_id, uint32_t a
cpu = node;
goto out;
}
@ -50,15 +50,15 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
/* Physically added CPUs do not have synchronised TSC. */
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -170,7 +170,6 @@ void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
@@ -209,7 +209,6 @@ acpi_numa_x2apic_affinity_init(struct ac
{
int pxm, node;
- int apic_id;
unsigned pxm;
nodeid_t node;
- u32 apic_id;
if (srat_disabled())
return;
@@ -178,8 +177,13 @@ acpi_numa_x2apic_affinity_init(struct ac
@@ -217,8 +216,13 @@ acpi_numa_x2apic_affinity_init(struct ac
bad_srat();
return;
}
@ -72,8 +72,8 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
pxm = pa->proximity_domain;
node = setup_node(pxm);
if (node < 0) {
@@ -187,11 +191,11 @@ acpi_numa_x2apic_affinity_init(struct ac
if (node == NUMA_NO_NODE) {
@@ -226,11 +230,11 @@ acpi_numa_x2apic_affinity_init(struct ac
return;
}
@ -89,7 +89,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -221,7 +225,7 @@ acpi_numa_processor_affinity_init(struct
@@ -262,7 +266,7 @@ acpi_numa_processor_affinity_init(struct
apicid_to_node[pa->apic_id] = node;
node_set(node, processor_nodes_parsed);
acpi_numa = 1;
@ -100,7 +100,7 @@ Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/drivers/acpi/numa.c
+++ b/xen/drivers/acpi/numa.c
@@ -199,9 +199,9 @@ int __init acpi_numa_init(void)
@@ -198,9 +198,9 @@ int __init acpi_numa_init(void)
/* SRAT: Static Resource Affinity Table */
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,

View File

@ -0,0 +1,49 @@
# Commit 29bcf64ce8bc0b1b7aacd00c8668f255c4f0686c
# Date 2015-10-29 13:31:10 +0100
# Author Julien Grall <julien.grall@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: Support hypercall_create_continuation for multicall
Multicall for ARM has been supported since commit f0dbdc6 "xen: arm: fully
implement multicall interface.". However, if a hypercall within a
multicall requires preemption, it will crash the host:
(XEN) Xen BUG at domain.c:347
(XEN) ----[ Xen-4.7-unstable arm64 debug=y Tainted: C ]----
[...]
(XEN) Xen call trace:
(XEN) [<00000000002420cc>] hypercall_create_continuation+0x64/0x380 (PC)
(XEN) [<0000000000217274>] do_memory_op+0x1b00/0x2334 (LR)
(XEN) [<0000000000250d2c>] do_multicall_call+0x114/0x124
(XEN) [<0000000000217ff0>] do_multicall+0x17c/0x23c
(XEN) [<000000000024f97c>] do_trap_hypercall+0x90/0x12c
(XEN) [<0000000000251ca8>] do_trap_hypervisor+0xd2c/0x1ba4
(XEN) [<00000000002582cc>] guest_sync+0x88/0xb8
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 5:
(XEN) Xen BUG at domain.c:347
(XEN) ****************************************
(XEN)
(XEN) Manual reset required ('noreboot' specified)
Looking at the code, the multicall support looks valid to me, as we only
need to fill call.args[...]. So drop the BUG();
This is CVE-2015-7812 / XSA-145.
Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -344,8 +344,6 @@ unsigned long hypercall_create_continuat
if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
{
- BUG(); /* XXX multicalls not implemented yet. */
-
__set_bit(_MCSF_call_preempted, &mcs->flags);
for ( i = 0; *p != '\0'; i++ )

View File

@ -0,0 +1,42 @@
# Commit 1c0e59ff15764e7b0c59282365974f5b8924ce83
# Date 2015-10-29 13:33:38 +0100
# Author Ian Campbell <ian.campbell@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: rate-limit logging from unimplemented PHYSDEVOP and HVMOP.
These are guest accessible and should therefore be rate-limited.
Moreover, include them only in debug builds.
This is CVE-2015-7813 / XSA-146.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/arm/hvm.c
+++ b/xen/arch/arm/hvm.c
@@ -57,7 +57,7 @@ long do_hvm_op(unsigned long op, XEN_GUE
default:
{
- printk("%s: Bad HVM op %ld.\n", __func__, op);
+ gdprintk(XENLOG_DEBUG, "HVMOP op=%lu: not implemented\n", op);
rc = -ENOSYS;
break;
}
--- a/xen/arch/arm/physdev.c
+++ b/xen/arch/arm/physdev.c
@@ -8,12 +8,13 @@
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/errno.h>
+#include <xen/sched.h>
#include <asm/hypercall.h>
int do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
- printk("%s %d cmd=%d: not implemented yet\n", __func__, __LINE__, cmd);
+ gdprintk(XENLOG_DEBUG, "PHYSDEVOP cmd=%d: not implemented\n", cmd);
return -ENOSYS;
}
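The reasoning above (guest-triggerable messages must not be able to flood the hypervisor log) can be illustrated with a small, self-contained counter-based throttle. This is only a sketch of the general idea, not how Xen's gdprintk is implemented; the macro and threshold below are invented for the example.

    #include <stdio.h>

    /* Allow at most BURST messages per call site; afterwards stay quiet
     * (a real implementation would also replenish the budget over time). */
    #define BURST 5

    #define ratelimited_log(fmt, ...)                              \
        do {                                                       \
            static unsigned int count_;                            \
            if (count_ < BURST) {                                  \
                count_++;                                          \
                fprintf(stderr, fmt, __VA_ARGS__);                 \
            }                                                      \
        } while (0)

    int main(void)
    {
        /* A guest issuing thousands of unimplemented ops only produces a
         * handful of log lines instead of thousands. */
        for (int cmd = 0; cmd < 10000; cmd++)
            ratelimited_log("PHYSDEVOP cmd=%d: not implemented\n", cmd);
        return 0;
    }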

View File

@ -0,0 +1,40 @@
# Commit 1ef01396fdff88b1c3331a09ca5c69619b90f4ea
# Date 2015-10-29 13:34:17 +0100
# Author Ian Campbell <ian.campbell@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
arm: handle races between relinquish_memory and free_domheap_pages
Primarily this means XENMEM_decrease_reservation from a toolstack
domain.
Unlike x86 we have no requirement right now to queue such pages onto
a separate list; if we hit this race then the other code has already
fully accepted responsibility for freeing this page, and therefore
there is nothing more for relinquish_memory to do.
This is CVE-2015-7814 / XSA-147.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Julien Grall <julien.grall@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -768,8 +768,15 @@ static int relinquish_memory(struct doma
{
/* Grab a reference to the page so it won't disappear from under us. */
if ( unlikely(!get_page(page, d)) )
- /* Couldn't get a reference -- someone is freeing this page. */
- BUG();
+ /*
+ * Couldn't get a reference -- someone is freeing this page and
+ * has already committed to doing so, so no more to do here.
+ *
+ * Note that the page must be left on the list, a list_del
+ * here will clash with the list_del done by the other
+ * party in the race and corrupt the list head.
+ */
+ continue;
if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
put_page(page);
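The pattern used above, try to take a reference and, if that fails, treat the page as already owned by the concurrent freer and skip it, can be sketched with C11 atomics. The structures below are hypothetical stand-ins, not Xen's page_info handling.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct page {
        atomic_uint refs;    /* 0 means someone is (or has finished) freeing it */
    };

    /* Take a reference only if the page is still live; mirrors the
     * "if ( unlikely(!get_page(page, d)) ) continue;" logic above. */
    static bool try_get_page(struct page *p)
    {
        unsigned int c = atomic_load(&p->refs);

        while (c != 0)
            if (atomic_compare_exchange_weak(&p->refs, &c, c + 1))
                return true;
        return false;
    }

    static void put_page(struct page *p)
    {
        atomic_fetch_sub(&p->refs, 1);
    }

    int main(void)
    {
        struct page pages[3] = {
            { .refs = 1 }, { .refs = 0 } /* lost the race */, { .refs = 2 },
        };

        for (int i = 0; i < 3; i++) {
            if (!try_get_page(&pages[i])) {
                /* The other party owns the free; leave the entry alone
                 * (in particular, do not unlink it ourselves). */
                printf("page %d: skipped, being freed elsewhere\n", i);
                continue;
            }
            printf("page %d: got reference, releasing\n", i);
            put_page(&pages[i]);
        }
        return 0;
    }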

View File

@ -1,5 +1,7 @@
References: bsc#950367 CVE-2015-7835 XSA-148
# Commit fe360c90ea13f309ef78810f1a2b92f2ae3b30b8
# Date 2015-10-29 13:35:07 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: guard against undue super page PTE creation
When optional super page support got added (commit bd1cd81d64 "x86: PV
@ -10,14 +12,13 @@ unconditionally.
This is CVE-2015-7835 / XSA-148.
Reported-by: "栾尚聪(好风)" <shangcong.lsc@alibaba-inc.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Index: xen-4.5.1-testing/xen/arch/x86/mm.c
===================================================================
--- xen-4.5.1-testing.orig/xen/arch/x86/mm.c
+++ xen-4.5.1-testing/xen/arch/x86/mm.c
@@ -162,7 +162,10 @@ static void put_superpage(unsigned long
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -160,7 +160,10 @@ static void put_superpage(unsigned long
static uint32_t base_disallow_mask;
/* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */
#define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
@ -27,9 +28,9 @@ Index: xen-4.5.1-testing/xen/arch/x86/mm.c
+ ? base_disallow_mask & ~_PAGE_PSE \
+ : base_disallow_mask)
#define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
base_disallow_mask : \
@@ -1790,7 +1793,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
#define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
base_disallow_mask : 0xFFFFF198U)
@@ -1839,7 +1842,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
}
/* Fast path for identical mapping and presence. */

View File

@ -0,0 +1,25 @@
# Commit d46896ebbb23f3a9fef2eb6066ae614fd1acfd96
# Date 2015-10-29 13:35:40 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
free domain's vcpu array
This was overlooked in fb442e2171 ("x86_64: allow more vCPU-s per
guest").
This is CVE-2015-7969 / XSA-149.
Reported-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -833,6 +833,7 @@ static void complete_domain_destroy(stru
xsm_free_security_domain(d);
free_cpumask_var(d->domain_dirty_cpumask);
+ xfree(d->vcpu);
free_domain_struct(d);
send_global_virq(VIRQ_DOM_EXC);

View File

@ -0,0 +1,205 @@
# Commit 101ce53266866144e724ed593173bc4098b300b9
# Date 2015-10-29 13:36:25 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/PoD: Eager sweep for zeroed pages
Based on the contents of a guest's physical address space,
p2m_pod_emergency_sweep() could degrade into a linear memcmp() from 0 to
max_gfn, which runs non-preemptibly.
As p2m_pod_emergency_sweep() runs behind the scenes in a number of contexts,
making it preemptible is not feasible.
Instead, a different approach is taken. Recently-populated pages are eagerly
checked for reclamation, which amortises the p2m_pod_emergency_sweep()
operation across each p2m_pod_demand_populate() operation.
Note that in the case that a 2M superpage can't be reclaimed as a superpage,
it is shattered if 4K pages of zeros can be reclaimed. This is unfortunate
but matches the previous behaviour, and is required to avoid regressions
(domain crash from PoD exhaustion) with VMs configured close to the limit.
This is CVE-2015-7970 / XSA-150.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -901,28 +901,6 @@ p2m_pod_zero_check(struct p2m_domain *p2
}
#define POD_SWEEP_LIMIT 1024
-
-/* When populating a new superpage, look at recently populated superpages
- * hoping that they've been zeroed. This will snap up zeroed pages as soon as
- * the guest OS is done with them. */
-static void
-p2m_pod_check_last_super(struct p2m_domain *p2m, unsigned long gfn_aligned)
-{
- unsigned long check_gfn;
-
- ASSERT(p2m->pod.last_populated_index < POD_HISTORY_MAX);
-
- check_gfn = p2m->pod.last_populated[p2m->pod.last_populated_index];
-
- p2m->pod.last_populated[p2m->pod.last_populated_index] = gfn_aligned;
-
- p2m->pod.last_populated_index =
- ( p2m->pod.last_populated_index + 1 ) % POD_HISTORY_MAX;
-
- p2m_pod_zero_check_superpage(p2m, check_gfn);
-}
-
-
#define POD_SWEEP_STRIDE 16
static void
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
@@ -963,7 +941,7 @@ p2m_pod_emergency_sweep(struct p2m_domai
* NB that this is a zero-sum game; we're increasing our cache size
* by re-increasing our 'debt'. Since we hold the pod lock,
* (entry_count - count) must remain the same. */
- if ( p2m->pod.count > 0 && i < limit )
+ if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
break;
}
@@ -975,6 +953,58 @@ p2m_pod_emergency_sweep(struct p2m_domai
}
+static void pod_eager_reclaim(struct p2m_domain *p2m)
+{
+ struct pod_mrp_list *mrp = &p2m->pod.mrp;
+ unsigned int i = 0;
+
+ /*
+ * Always check one page for reclaimation.
+ *
+ * If the PoD pool is empty, keep checking some space is found, or all
+ * entries have been exhaused.
+ */
+ do
+ {
+ unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
+ unsigned long gfn = mrp->list[idx];
+
+ if ( gfn != INVALID_GFN )
+ {
+ if ( gfn & POD_LAST_SUPERPAGE )
+ {
+ gfn &= ~POD_LAST_SUPERPAGE;
+
+ if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
+ {
+ unsigned int x;
+
+ for ( x = 0; x < SUPERPAGE_PAGES; ++x, ++gfn )
+ p2m_pod_zero_check(p2m, &gfn, 1);
+ }
+ }
+ else
+ p2m_pod_zero_check(p2m, &gfn, 1);
+
+ mrp->list[idx] = INVALID_GFN;
+ }
+
+ } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
+}
+
+static void pod_eager_record(struct p2m_domain *p2m,
+ unsigned long gfn, unsigned int order)
+{
+ struct pod_mrp_list *mrp = &p2m->pod.mrp;
+
+ ASSERT(mrp->list[mrp->idx] == INVALID_GFN);
+ ASSERT(gfn != INVALID_GFN);
+
+ mrp->list[mrp->idx++] =
+ gfn | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
+ mrp->idx %= ARRAY_SIZE(mrp->list);
+}
+
int
p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
unsigned int order,
@@ -1015,6 +1045,8 @@ p2m_pod_demand_populate(struct p2m_domai
return 0;
}
+ pod_eager_reclaim(p2m);
+
/* Only sweep if we're actually out of memory. Doing anything else
* causes unnecessary time and fragmentation of superpages in the p2m. */
if ( p2m->pod.count == 0 )
@@ -1051,6 +1083,8 @@ p2m_pod_demand_populate(struct p2m_domai
p2m->pod.entry_count -= (1 << order);
BUG_ON(p2m->pod.entry_count < 0);
+ pod_eager_record(p2m, gfn_aligned, order);
+
if ( tb_init_done )
{
struct {
@@ -1066,12 +1100,6 @@ p2m_pod_demand_populate(struct p2m_domai
__trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
}
- /* Check the last guest demand-populate */
- if ( p2m->pod.entry_count > p2m->pod.count
- && (order == PAGE_ORDER_2M)
- && (q & P2M_ALLOC) )
- p2m_pod_check_last_super(p2m, gfn_aligned);
-
pod_unlock(p2m);
return 0;
out_of_memory:
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -60,6 +60,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
/* Init the datastructures for later use by the p2m code */
static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
{
+ unsigned int i;
int ret = 0;
mm_rwlock_init(&p2m->lock);
@@ -75,6 +76,9 @@ static int p2m_initialise(struct domain
p2m->np2m_base = P2M_BASE_EADDR;
+ for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
+ p2m->pod.mrp.list[i] = INVALID_GFN;
+
if ( hap_enabled(d) && cpu_has_vmx )
ret = ept_p2m_init(p2m);
else
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -292,10 +292,20 @@ struct p2m_domain {
entry_count; /* # of pages in p2m marked pod */
unsigned long reclaim_single; /* Last gpfn of a scan */
unsigned long max_guest; /* gpfn of max guest demand-populate */
-#define POD_HISTORY_MAX 128
- /* gpfn of last guest superpage demand-populated */
- unsigned long last_populated[POD_HISTORY_MAX];
- unsigned int last_populated_index;
+
+ /*
+ * Tracking of the most recently populated PoD pages, for eager
+ * reclamation.
+ */
+ struct pod_mrp_list {
+#define NR_POD_MRP_ENTRIES 32
+
+/* Encode ORDER_2M superpage in top bit of GFN */
+#define POD_LAST_SUPERPAGE (INVALID_GFN & ~(INVALID_GFN >> 1))
+
+ unsigned long list[NR_POD_MRP_ENTRIES];
+ unsigned int idx;
+ } mrp;
mm_lock_t lock; /* Locking of private pod structs, *
* not relying on the p2m lock. */
} pod;
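The eager-reclaim bookkeeping added above is essentially a small fixed-size ring of recently populated GFNs. A standalone sketch of that data structure follows; the size, names and INVALID marker are illustrative rather than the Xen definitions.

    #include <stdio.h>

    #define MRP_ENTRIES 8
    #define INVALID_GFN (~0UL)

    /* Ring of the most recently populated guest frame numbers. */
    struct mrp_list {
        unsigned long list[MRP_ENTRIES];
        unsigned int idx;               /* next slot to overwrite */
    };

    static void mrp_init(struct mrp_list *mrp)
    {
        for (unsigned int i = 0; i < MRP_ENTRIES; i++)
            mrp->list[i] = INVALID_GFN;
        mrp->idx = 0;
    }

    /* Record a freshly populated gfn, overwriting the oldest entry. */
    static void mrp_record(struct mrp_list *mrp, unsigned long gfn)
    {
        mrp->list[mrp->idx++] = gfn;
        mrp->idx %= MRP_ENTRIES;
    }

    /* Walk the ring, "reclaiming" (here: just printing) each recorded gfn.
     * The real code checks whether the page is still zeroed and, if so,
     * puts it back into the PoD cache. */
    static void mrp_reclaim(struct mrp_list *mrp)
    {
        for (unsigned int i = 0; i < MRP_ENTRIES; i++) {
            unsigned int slot = (mrp->idx + i) % MRP_ENTRIES;

            if (mrp->list[slot] == INVALID_GFN)
                continue;
            printf("checking gfn %#lx for reclaim\n", mrp->list[slot]);
            mrp->list[slot] = INVALID_GFN;
        }
    }

    int main(void)
    {
        struct mrp_list mrp;

        mrp_init(&mrp);
        for (unsigned long gfn = 0x100; gfn < 0x10c; gfn++)  /* 12 > ring size */
            mrp_record(&mrp, gfn);
        mrp_reclaim(&mrp);  /* only the 8 most recent gfns are still tracked */
        return 0;
    }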

View File

@ -1,17 +1,19 @@
# Commit 6e97c4b37386c2d09e09e9b5d5d232e37728b960
# Date 2015-10-29 13:36:52 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
xenoprof: free domain's vcpu array
This was overlooked in fb442e2171 ("x86_64: allow more vCPU-s per
guest").
This is XSA-151.
This is CVE-2015-7969 / XSA-151.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.5.1-testing/xen/common/xenoprof.c
===================================================================
--- xen-4.5.1-testing.orig/xen/common/xenoprof.c
+++ xen-4.5.1-testing/xen/common/xenoprof.c
--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -239,6 +239,7 @@ static int alloc_xenoprof_struct(
d->xenoprof->rawbuf = alloc_xenheap_pages(get_order_from_pages(npages), 0);
if ( d->xenoprof->rawbuf == NULL )

View File

@ -1,3 +1,7 @@
# Commit 95e7415843b94c346e5ba8682665f508f220e04b
# Date 2015-10-29 13:37:19 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86: rate-limit logging in do_xen{oprof,pmu}_op()
Some of the sub-ops are accessible to all guests, and hence should be
@ -5,14 +9,37 @@ rate-limited. In the xenoprof case, just like for XSA-146, include them
only in debug builds. Since the vPMU code is rather new, allow them to
be always present, but downgrade them to (rate limited) guest messages.
This is XSA-152.
This is CVE-2015-7971 / XSA-152.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Index: xen-4.5.1-testing/xen/common/xenoprof.c
===================================================================
--- xen-4.5.1-testing.orig/xen/common/xenoprof.c
+++ xen-4.5.1-testing/xen/common/xenoprof.c
--- a/xen/arch/x86/cpu/vpmu.c
+++ b/xen/arch/x86/cpu/vpmu.c
@@ -682,8 +682,8 @@ long do_xenpmu_op(unsigned int op, XEN_G
vpmu_mode = pmu_params.val;
else if ( vpmu_mode != pmu_params.val )
{
- printk(XENLOG_WARNING
- "VPMU: Cannot change mode while active VPMUs exist\n");
+ gprintk(XENLOG_WARNING,
+ "VPMU: Cannot change mode while active VPMUs exist\n");
ret = -EBUSY;
}
@@ -714,8 +714,8 @@ long do_xenpmu_op(unsigned int op, XEN_G
vpmu_features = pmu_params.val;
else
{
- printk(XENLOG_WARNING "VPMU: Cannot change features while"
- " active VPMUs exist\n");
+ gprintk(XENLOG_WARNING,
+ "VPMU: Cannot change features while active VPMUs exist\n");
ret = -EBUSY;
}
--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -676,15 +676,13 @@ ret_t do_xenoprof_op(int op, XEN_GUEST_H
if ( (op < 0) || (op > XENOPROF_last_op) )

View File

@ -1,7 +1,8 @@
From 27593ec62bdad8621df910931349d964a6dbaa8c Mon Sep 17 00:00:00 2001
From: Ian Jackson <ian.jackson@eu.citrix.com>
Date: Wed, 21 Oct 2015 16:18:30 +0100
Subject: [PATCH XSA-153 v3] libxl: adjust PoD target by memory fudge, too
# Commit e294a0c3af9f4443dc692b180fb1771b1cb075e8
# Date 2015-10-29 15:11:51 +0000
# Author Ian Jackson <ian.jackson@eu.citrix.com>
# Committer Ian Jackson <Ian.Jackson@eu.citrix.com>
libxl: adjust PoD target by memory fudge, too
PoD guests need to balloon at least as far as required by PoD, or risk
crashing. Currently they don't necessarily know what the right value
@ -32,37 +33,30 @@ probably also in stable trees.
This is XSA-153.
Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
---
tools/libxl/libxl.c | 2 +-
tools/libxl/libxl_dom.c | 9 ++++++++-
2 files changed, 9 insertions(+), 2 deletions(-)
(cherry picked from commit 56fb5fd62320eb40a7517206f9706aa9188d6f7b)
Index: xen-4.5.1-testing/tools/libxl/libxl.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl.c
+++ xen-4.5.1-testing/tools/libxl/libxl.c
@@ -4859,7 +4859,7 @@ retry_transaction:
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4815,7 +4815,7 @@ retry_transaction:
}
new_target_memkb -= videoram;
rc = xc_domain_set_pod_target(ctx->xch, domid,
- new_target_memkb / 4, NULL, NULL, NULL);
+ (new_target_memkb + LIBXL_MAXMEM_CONSTANT) / 4, NULL, NULL, NULL);
if (rc != 0) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
"xc_domain_set_pod_target domid=%d, memkb=%d "
Index: xen-4.5.1-testing/tools/libxl/libxl_dom.c
===================================================================
--- xen-4.5.1-testing.orig/tools/libxl/libxl_dom.c
+++ xen-4.5.1-testing/tools/libxl/libxl_dom.c
@@ -446,6 +446,7 @@ int libxl__build_post(libxl__gc *gc, uin
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -486,6 +486,7 @@ int libxl__build_post(libxl__gc *gc, uin
xs_transaction_t t;
char **ents;
int i, rc;
+ int64_t mem_target_fudge;
rc = libxl_domain_sched_params_set(CTX, domid, &info->sched_params);
if (rc)
@@ -472,11 +473,17 @@ int libxl__build_post(libxl__gc *gc, uin
if (info->num_vnuma_nodes && !info->num_vcpu_soft_affinity) {
rc = set_vnuma_affinity(gc, domid, info);
@@ -518,11 +519,17 @@ int libxl__build_post(libxl__gc *gc, uin
}
}

View File

@ -0,0 +1,88 @@
# Commit 59a5061723ba47c0028cf48487e5de551c42a378
# Date 2015-11-02 15:33:38 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/PoD: Make p2m_pod_empty_cache() restartable
This avoids a long running operation when destroying a domain with a
large PoD cache.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -375,11 +375,11 @@ out:
return ret;
}
-void
-p2m_pod_empty_cache(struct domain *d)
+int p2m_pod_empty_cache(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
struct page_info *page;
+ unsigned int i;
/* After this barrier no new PoD activities can happen. */
BUG_ON(!d->is_dying);
@@ -389,8 +389,6 @@ p2m_pod_empty_cache(struct domain *d)
while ( (page = page_list_remove_head(&p2m->pod.super)) )
{
- int i;
-
for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
{
BUG_ON(page_get_owner(page + i) != d);
@@ -398,19 +396,27 @@ p2m_pod_empty_cache(struct domain *d)
}
p2m->pod.count -= SUPERPAGE_PAGES;
+
+ if ( hypercall_preempt_check() )
+ goto out;
}
- while ( (page = page_list_remove_head(&p2m->pod.single)) )
+ for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
{
BUG_ON(page_get_owner(page) != d);
page_list_add_tail(page, &d->page_list);
p2m->pod.count -= 1;
+
+ if ( i && !(i & 511) && hypercall_preempt_check() )
+ goto out;
}
BUG_ON(p2m->pod.count != 0);
+ out:
unlock_page_alloc(p2m);
+ return p2m->pod.count ? -ERESTART : 0;
}
int
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -815,7 +815,7 @@ int paging_teardown(struct domain *d)
return rc;
/* Move populate-on-demand cache back to domain_list for destruction */
- p2m_pod_empty_cache(d);
+ rc = p2m_pod_empty_cache(d);
return rc;
}
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -588,7 +588,7 @@ void p2m_pod_dump_data(struct domain *d)
/* Move all pages from the populate-on-demand cache to the domain page_list
* (usually in preparation for domain destruction) */
-void p2m_pod_empty_cache(struct domain *d);
+int p2m_pod_empty_cache(struct domain *d);
/* Set populate-on-demand cache size so that the total memory allocated to a
* domain matches target */
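The restartability pattern used above, do a bounded amount of work, periodically ask whether preemption is pending, and return a distinguished "call me again" value if so, looks roughly like this in isolation. The preemption check and error constant are stand-ins, not the hypervisor's.

    #include <stdio.h>

    #define ERESTART_STUB 85        /* stand-in for Xen's -ERESTART value */

    /* Pretend a preemption request shows up every 1024 items of work. */
    static int preempt_pending(unsigned long done)
    {
        return done % 1024 == 0;
    }

    /* Drain 'remaining' items, but bail out periodically so a single call
     * can never monopolise the CPU; the caller re-invokes until it gets 0. */
    static int empty_cache(unsigned long *remaining)
    {
        unsigned long i = 0;

        while (*remaining) {
            --*remaining;                    /* "free one item" */

            /* Only poll the (potentially costly) check every 512 items. */
            if (++i % 512 == 0 && preempt_pending(i))
                return *remaining ? -ERESTART_STUB : 0;
        }
        return 0;
    }

    int main(void)
    {
        unsigned long items = 4096;
        int rc, calls = 0;

        do {
            rc = empty_cache(&items);
            calls++;
        } while (rc == -ERESTART_STUB);

        printf("drained in %d call(s), rc=%d\n", calls, rc);
        return 0;
    }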

View File

@ -0,0 +1,134 @@
# Commit bd2239d9fa975a1ee5bcd27c218ae042cd0a57bc
# Date 2015-11-10 12:03:08 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: always intercept #AC and #DB
Both are benign exceptions, and both can be triggered by exception
delivery itself, so intercepting them is required to prevent a guest from
locking up a CPU (no other VM exits would occur once it gets into such a
loop).
The specific scenarios:
1) #AC may be raised during exception delivery if the handler is set to
be a ring-3 one by a 32-bit guest, and the stack is misaligned.
This is CVE-2015-5307 / XSA-156.
Reported-by: Benjamin Serebrin <serebrin@google.com>
2) #DB may be raised during exception delivery when a breakpoint got
placed on a data structure involved in delivering the exception. This
can result in an endless loop when a 64-bit guest uses a non-zero IST
for the vector 1 IDT entry, but even without use of IST the time it
takes until a contributory fault would get raised (results depending
on the handler) may be quite long.
This is CVE-2015-8104 / XSA-156.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1043,10 +1043,11 @@ static void noreturn svm_do_resume(struc
unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
{
uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
- uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
+
v->arch.hvm_vcpu.debug_state_latch = debug_state;
vmcb_set_exception_intercepts(
- vmcb, debug_state ? (intercepts | mask) : (intercepts & ~mask));
+ vmcb, debug_state ? (intercepts | (1U << TRAP_int3))
+ : (intercepts & ~(1U << TRAP_int3)));
}
if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
@@ -2434,8 +2435,9 @@ void svm_vmexit_handler(struct cpu_user_
case VMEXIT_EXCEPTION_DB:
if ( !v->domain->debugger_attached )
- goto unexpected_exit_type;
- domain_pause_for_debugger();
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+ else
+ domain_pause_for_debugger();
break;
case VMEXIT_EXCEPTION_BP:
@@ -2483,6 +2485,11 @@ void svm_vmexit_handler(struct cpu_user_
break;
}
+ case VMEXIT_EXCEPTION_AC:
+ HVMTRACE_1D(TRAP, TRAP_alignment_check);
+ hvm_inject_hw_exception(TRAP_alignment_check, vmcb->exitinfo1);
+ break;
+
case VMEXIT_EXCEPTION_UD:
svm_vmexit_ud_intercept(regs);
break;
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1224,16 +1224,10 @@ static void vmx_update_host_cr3(struct v
void vmx_update_debug_state(struct vcpu *v)
{
- unsigned long mask;
-
- mask = 1u << TRAP_int3;
- if ( !cpu_has_monitor_trap_flag )
- mask |= 1u << TRAP_debug;
-
if ( v->arch.hvm_vcpu.debug_state_latch )
- v->arch.hvm_vmx.exception_bitmap |= mask;
+ v->arch.hvm_vmx.exception_bitmap |= 1U << TRAP_int3;
else
- v->arch.hvm_vmx.exception_bitmap &= ~mask;
+ v->arch.hvm_vmx.exception_bitmap &= ~(1U << TRAP_int3);
vmx_vmcs_enter(v);
vmx_update_exception_bitmap(v);
@@ -3041,9 +3035,10 @@ void vmx_vmexit_handler(struct cpu_user_
__vmread(EXIT_QUALIFICATION, &exit_qualification);
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
- if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
- goto exit_and_crash;
- domain_pause_for_debugger();
+ if ( !v->domain->debugger_attached )
+ hvm_inject_hw_exception(vector, HVM_DELIVER_NO_ERROR_CODE);
+ else
+ domain_pause_for_debugger();
break;
case TRAP_int3:
{
@@ -3108,6 +3103,11 @@ void vmx_vmexit_handler(struct cpu_user_
hvm_inject_page_fault(regs->error_code, exit_qualification);
break;
+ case TRAP_alignment_check:
+ HVMTRACE_1D(TRAP, vector);
+ __vmread(VM_EXIT_INTR_ERROR_CODE, &ecode);
+ hvm_inject_hw_exception(vector, ecode);
+ break;
case TRAP_nmi:
if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
X86_EVENTTYPE_NMI )
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -384,7 +384,10 @@ static inline int hvm_event_pending(stru
(X86_CR4_VMXE | X86_CR4_PAE | X86_CR4_MCE))
/* These exceptions must always be intercepted. */
-#define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op))
+#define HVM_TRAP_MASK ((1U << TRAP_debug) | \
+ (1U << TRAP_invalid_op) | \
+ (1U << TRAP_alignment_check) | \
+ (1U << TRAP_machine_check))
/*
* x86 event types. This enumeration is valid for:

View File

@ -0,0 +1,20 @@
# Commit 057e0e72d2a5d598087c5f167ec6a13203a3cf65
# Date 2015-11-12 16:59:18 +0100
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/HVM: don't inject #DB with error code
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper@citrix.com>
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -4071,7 +4071,7 @@ void hvm_task_switch(
goto out;
if ( (tss.trace & 1) && !exn_raised )
- hvm_inject_hw_exception(TRAP_debug, tss_sel & 0xfff8);
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
tr.attr.fields.type = 0xb; /* busy 32-bit tss */
hvm_set_segment_register(v, x86_seg_tr, &tr);

View File

@ -21,11 +21,11 @@ Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Benoit Canet <benoit@irqsave.net>
Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/block/qcow.c
Index: xen-4.6.0-testing/tools/qemu-xen-dir-remote/block/qcow.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/block/qcow.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/block/qcow.c
@@ -147,6 +147,14 @@ static int qcow_open(BlockDriverState *b
--- xen-4.6.0-testing.orig/tools/qemu-xen-dir-remote/block/qcow.c
+++ xen-4.6.0-testing/tools/qemu-xen-dir-remote/block/qcow.c
@@ -148,6 +148,14 @@ static int qcow_open(BlockDriverState *b
goto fail;
}

View File

@ -1,216 +0,0 @@
xl: Sane handling of extra config file arguments
Various xl sub-commands take additional parameters containing = as
additional config fragments.
The handling of these config fragments has a number of bugs:
1. Use of a static 1024-byte buffer. (If truncation would occur,
with semi-trusted input, a security risk arises due to quotes
being lost.)
2. Mishandling of the return value from snprintf, so that if
truncation occurs, the to-write pointer is updated with the
wanted-to-write length, resulting in stack corruption. (This is
XSA-137.)
3. Clone-and-hack of the code for constructing the appended
config file.
These are fixed here, by introducing a new function
`string_realloc_append' and using it everywhere. The `extra_info'
buffers are replaced by pointers, which start off NULL and are
explicitly freed on all return paths.
The separate variable which will become dom_info.extra_config is
abolished (which involves moving the clearing of dom_info).
Additional bugs I observe, not fixed here:
4. The functions which now call string_realloc_append use ad-hoc
error returns, with multiple calls to `return'. This currently
necessitates multiple new calls to `free'.
5. Many of the paths in xl call exit(-rc) where rc is a libxl status
code. This is a ridiculous exit status `convention'.
6. The loops for handling extra config data are clone-and-hacks.
7. Once the extra config buffer is accumulated, it must be combined
with the appropriate main config file. The code to do this
combining is clone-and-hacked too.
Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
Tested-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -151,7 +151,7 @@ struct domain_create {
int console_autoconnect;
int checkpointed_stream;
const char *config_file;
- const char *extra_config; /* extra config string */
+ char *extra_config; /* extra config string */
const char *restore_file;
int migrate_fd; /* -1 means none */
char **migration_domname_r; /* from malloc */
@@ -4572,11 +4572,25 @@ int main_vm_list(int argc, char **argv)
return 0;
}
+static void string_realloc_append(char **accumulate, const char *more)
+{
+ /* Appends more to accumulate. Accumulate is either NULL, or
+ * points (always) to a malloc'd nul-terminated string. */
+
+ size_t oldlen = *accumulate ? strlen(*accumulate) : 0;
+ size_t morelen = strlen(more) + 1/*nul*/;
+ if (oldlen > SSIZE_MAX || morelen > SSIZE_MAX - oldlen) {
+ fprintf(stderr,"Additional config data far too large\n");
+ exit(-ERROR_FAIL);
+ }
+
+ *accumulate = xrealloc(*accumulate, oldlen + morelen);
+ memcpy(*accumulate + oldlen, more, morelen);
+}
+
int main_create(int argc, char **argv)
{
const char *filename = NULL;
- char *p;
- char extra_config[1024];
struct domain_create dom_info;
int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
@@ -4591,6 +4605,8 @@ int main_create(int argc, char **argv)
{0, 0, 0, 0}
};
+ dom_info.extra_config = NULL;
+
if (argv[1] && argv[1][0] != '-' && !strchr(argv[1], '=')) {
filename = argv[1];
argc--; argv++;
@@ -4630,20 +4646,21 @@ int main_create(int argc, char **argv)
break;
}
- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ memset(&dom_info, 0, sizeof(dom_info));
+
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&dom_info.extra_config, argv[optind]);
+ string_realloc_append(&dom_info.extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(dom_info.extra_config);
return 2;
}
}
- memset(&dom_info, 0, sizeof(dom_info));
dom_info.debug = debug;
dom_info.daemonize = daemonize;
dom_info.monitor = monitor;
@@ -4651,16 +4668,18 @@ int main_create(int argc, char **argv)
dom_info.dryrun = dryrun_only;
dom_info.quiet = quiet;
dom_info.config_file = filename;
- dom_info.extra_config = extra_config;
dom_info.migrate_fd = -1;
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;
rc = create_domain(&dom_info);
- if (rc < 0)
+ if (rc < 0) {
+ free(dom_info.extra_config);
return -rc;
+ }
+ free(dom_info.extra_config);
return 0;
}
@@ -4668,8 +4687,7 @@ int main_config_update(int argc, char **
{
uint32_t domid;
const char *filename = NULL;
- char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
void *config_data = 0;
int config_len = 0;
libxl_domain_config d_config;
@@ -4707,15 +4725,15 @@ int main_config_update(int argc, char **
break;
}
- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&extra_config, argv[optind]);
+ string_realloc_append(&extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(extra_config);
return 2;
}
}
@@ -4724,7 +4742,8 @@ int main_config_update(int argc, char **
rc = libxl_read_file_contents(ctx, filename,
&config_data, &config_len);
if (rc) { fprintf(stderr, "Failed to read config file: %s: %s\n",
- filename, strerror(errno)); return ERROR_FAIL; }
+ filename, strerror(errno));
+ free(extra_config); return ERROR_FAIL; }
if (strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2 + 1)) {
fprintf(stderr, "Failed to attach extra configration\n");
@@ -4765,7 +4784,7 @@ int main_config_update(int argc, char **
libxl_domain_config_dispose(&d_config);
free(config_data);
-
+ free(extra_config);
return 0;
}
@@ -7022,7 +7041,7 @@ int main_cpupoolcreate(int argc, char **
{
const char *filename = NULL, *config_src=NULL;
const char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
int opt;
static struct option opts[] = {
{"defconfig", 1, 0, 'f'},
@@ -7056,13 +7075,10 @@ int main_cpupoolcreate(int argc, char **
break;
}
- memset(extra_config, 0, sizeof(extra_config));
while (optind < argc) {
if ((p = strchr(argv[optind], '='))) {
- if (strlen(extra_config) + 1 + strlen(argv[optind]) < sizeof(extra_config)) {
- strcat(extra_config, "\n");
- strcat(extra_config, argv[optind]);
- }
+ string_realloc_append(&extra_config, "\n");
+ string_realloc_append(&extra_config, argv[optind]);
} else if (!filename) {
filename = argv[optind];
} else {
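Bug 2 in the list above deserves a concrete illustration: snprintf() returns the length it wanted to write, not what it actually wrote, so advancing a cursor by that value walks past the buffer as soon as truncation starts. A minimal sketch, with the buffer shrunk to make the overshoot obvious (this is not the xl code itself, and the config fragments are invented):

    #include <stdio.h>

    int main(void)
    {
        char buf[16];
        const char *args[] = { "name=demo", "memory=512", "vcpus=4" };
        size_t used = 0;

        /* Broken pattern: advancing by snprintf()'s return value without
         * checking for truncation.  Once output is truncated, 'used'
         * overshoots the buffer and the next remaining-space computation
         * underflows -- the stack corruption described as bug 2 / XSA-137. */
        for (unsigned int i = 0; i < 3; i++) {
            int want = snprintf(buf + used, sizeof(buf) - used, "%s\n", args[i]);

            used += (size_t)want;
            if (used >= sizeof(buf)) {
                printf("cursor would sit %zu byte(s) past a %zu-byte buffer\n",
                       used - sizeof(buf), sizeof(buf));
                break;
            }
        }

        /* The fix in the patch above sidesteps the issue entirely by growing
         * a heap buffer (string_realloc_append) instead of appending into a
         * fixed-size stack array. */
        return 0;
    }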

View File

@ -1,37 +0,0 @@
tools: libxl: allow permissive qemu-upstream pci passthrough
Since XSA-131, qemu-xen restricts access to PCI config space by default. In
order to allow local configuration, the existing libxl_device_pci
"permissive" flag needs to be plumbed through via the new QMP property
added by the XSA-131 patches.
Versions of QEMU prior to XSA-131 did not support this permissive
property, so we only pass it if it is true. Older versions only
supported permissive mode.
qemu-xen-traditional already supports the permissive mode setting via
xenstore.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
--- a/tools/libxl/libxl_qmp.c
+++ b/tools/libxl/libxl_qmp.c
@@ -835,6 +835,18 @@ int libxl__qmp_pci_add(libxl__gc *gc, in
QMP_PARAMETERS_SPRINTF(&args, "addr", "%x.%x",
PCI_SLOT(pcidev->vdevfn), PCI_FUNC(pcidev->vdevfn));
}
+ /*
+ * Version of QEMU prior to the XSA-131 fix did not support this
+ * property and were effectively always in permissive mode. The
+ * fix for XSA-131 switched the default to be restricted by
+ * default and added the permissive property.
+ *
+ * Therefore in order to support both old and new QEMU we only set
+ * the permissive flag if it is true. Users of older QEMU have no
+ * reason to set the flag so this is ok.
+ */
+ if (pcidev->permissive)
+ qmp_parameters_add_bool(gc, &args, "permissive", true);
rc = qmp_synchronous_send(qmp, "device_add", args,
NULL, NULL, qmp->timeout);

View File

@ -1,74 +0,0 @@
From a9de14175548c04e0f8be7fae219246509ba46a9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:13:31 +0200
Subject: [PATCH 1/3] ide: Check array bounds before writing to io_buffer
(CVE-2015-5154)
If the end_transfer_func of a command is called because enough data has
been read or written for the current PIO transfer, and it fails to
correctly call the command completion functions, the DRQ bit in the
status register and s->end_transfer_func may remain set. This allows the
guest to access further bytes in s->io_buffer beyond s->data_end, and
eventually overflowing the io_buffer.
One case where this currently happens is emulation of the ATAPI command
START STOP UNIT.
This patch fixes the problem by adding explicit array bounds checks
before accessing the buffer instead of relying on end_transfer_func to
function correctly.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -3002,6 +3002,10 @@ static void ide_data_writew(void *opaque
buffered_pio_write(s, addr, 2);
p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return;
+ }
+
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
@@ -3021,6 +3025,10 @@ static uint32_t ide_data_readw(void *opa
buffered_pio_read(s, addr, 2);
p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
@@ -3040,6 +3048,10 @@ static void ide_data_writel(void *opaque
buffered_pio_write(s, addr, 4);
p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return;
+ }
+
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
@@ -3059,6 +3071,10 @@ static uint32_t ide_data_readl(void *opa
buffered_pio_read(s, addr, 4);
p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;

View File

@ -1,68 +0,0 @@
From 1d3c2268f8708126a34064c2e0c1000b40e6f3e5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:41:27 +0200
Subject: [PATCH 3/3] ide: Clear DRQ after handling all expected accesses
This is additional hardening against an end_transfer_func that fails to
clear the DRQ status bit. The bit must be unset as soon as the PIO
transfer has completed, so it's better to do this in a central place
instead of duplicating the code in all commands (and forgetting it in
some).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -3016,8 +3016,10 @@ static void ide_data_writew(void *opaque
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}
static uint32_t ide_data_readw(void *opaque, uint32_t addr)
@@ -3039,8 +3041,10 @@ static uint32_t ide_data_readw(void *opa
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}
@@ -3062,8 +3066,10 @@ static void ide_data_writel(void *opaque
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}
static uint32_t ide_data_readl(void *opaque, uint32_t addr)
@@ -3085,8 +3091,10 @@ static uint32_t ide_data_readl(void *opa
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}
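Taken together, the two IDE patches above implement a common pattern for emulated PIO: check that the guest-driven cursor still has room before every access, and drop the "more data expected" status bit as soon as the transfer window is exhausted. A compressed, device-agnostic sketch, with field and constant names invented for the example:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BUF_SIZE   16
    #define DRQ_FLAG   0x08        /* "data request": more PIO data expected */

    struct pio_state {
        uint8_t  buf[BUF_SIZE];
        uint8_t *ptr, *end;        /* current cursor and end of valid window */
        uint8_t  status;
    };

    static void pio_write16(struct pio_state *s, uint16_t val)
    {
        /* Bounds check first: a hostile guest may keep writing after the
         * command logic failed to terminate the transfer. */
        if (s->ptr + 2 > s->end)
            return;

        memcpy(s->ptr, &val, 2);
        s->ptr += 2;

        /* Clear DRQ centrally once the window is consumed, instead of
         * trusting every command's completion path to do it. */
        if (s->ptr >= s->end) {
            s->status &= (uint8_t)~DRQ_FLAG;
            printf("transfer complete, DRQ cleared\n");
        }
    }

    int main(void)
    {
        struct pio_state s = { .status = DRQ_FLAG };

        s.ptr = s.buf;
        s.end = s.buf + 4;          /* command expects only 4 bytes */

        for (int i = 0; i < 8; i++) /* guest pushes 16 bytes anyway */
            pio_write16(&s, 0xabcd);

        printf("status=%#x, bytes accepted=%td\n",
               (unsigned)s.status, s.ptr - s.buf);
        return 0;
    }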

View File

@ -1,54 +0,0 @@
Subject: ATAPI: STARTSTOPUNIT only eject/load media if powercondition is 0
From: Ronnie Sahlberg ronniesahlberg@gmail.com Tue Jul 31 11:28:26 2012 +1000
Date: Wed Sep 12 15:50:09 2012 +0200:
Git: ce560dcf20c14194db5ef3b9fc1ea592d4e68109
The START STOP UNIT command will only eject/load media if
power condition is zero.
If power condition is !0 then LOEJ and START will be ignored.
From MMC (sbc contains similar wordings too)
The Power Conditions field requests the block device to be placed
in the power condition defined in
Table 558. If this field has a value other than 0h then the Start
and LoEj bits shall be ignored.
Signed-off-by: Ronnie Sahlberg <ronniesahlberg@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
From aa851d30acfbb9580098ac1dc82885530cb8b3c1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:17:46 +0200
Subject: [PATCH 2/3] ide/atapi: Fix START STOP UNIT command completion
The command must be completed on all code paths. START STOP UNIT with
pwrcnd set should succeed without doing anything.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/atapi.c | 1 +
1 file changed, 1 insertion(+)
Index: xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
===================================================================
--- xen-4.2.5-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/ide.c
+++ xen-4.2.5-testing/tools/qemu-xen-traditional-dir-remote/hw/ide.c
@@ -2095,9 +2095,16 @@ static void ide_atapi_cmd(IDEState *s)
break;
case GPCMD_START_STOP_UNIT:
{
- int start, eject;
+ int start, eject, pwrcnd;
start = packet[4] & 1;
eject = (packet[4] >> 1) & 1;
+ pwrcnd = buf[4] & 0xf0;
+
+ if (pwrcnd) {
+ /* eject/load only happens for power condition == 0 */
+ ide_atapi_cmd_ok(s);
+ return;
+ }
if (eject && !start) {
/* eject the disk */

View File

@ -1,74 +0,0 @@
From a9de14175548c04e0f8be7fae219246509ba46a9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:13:31 +0200
Subject: [PATCH 1/3] ide: Check array bounds before writing to io_buffer
(CVE-2015-5154)
If the end_transfer_func of a command is called because enough data has
been read or written for the current PIO transfer, and it fails to
correctly call the command completion functions, the DRQ bit in the
status register and s->end_transfer_func may remain set. This allows the
guest to access further bytes in s->io_buffer beyond s->data_end, and
eventually overflowing the io_buffer.
One case where this currently happens is emulation of the ATAPI command
START STOP UNIT.
This patch fixes the problem by adding explicit array bounds checks
before accessing the buffer instead of relying on end_transfer_func to
function correctly.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/core.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
@@ -1901,6 +1901,10 @@ void ide_data_writew(void *opaque, uint3
}
p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return;
+ }
+
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
@@ -1922,6 +1926,10 @@ uint32_t ide_data_readw(void *opaque, ui
}
p = s->data_ptr;
+ if (p + 2 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
@@ -1943,6 +1951,10 @@ void ide_data_writel(void *opaque, uint3
}
p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return;
+ }
+
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
@@ -1964,6 +1976,10 @@ uint32_t ide_data_readl(void *opaque, ui
}
p = s->data_ptr;
+ if (p + 4 > s->data_end) {
+ return 0;
+ }
+
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;

View File

@ -1,68 +0,0 @@
From 1d3c2268f8708126a34064c2e0c1000b40e6f3e5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:41:27 +0200
Subject: [PATCH 3/3] ide: Clear DRQ after handling all expected accesses
This is additional hardening against an end_transfer_func that fails to
clear the DRQ status bit. The bit must be unset as soon as the PIO
transfer has completed, so it's better to do this in a central place
instead of duplicating the code in all commands (and forgetting it in
some).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/core.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/core.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/core.c
@@ -1908,8 +1908,10 @@ void ide_data_writew(void *opaque, uint3
*(uint16_t *)p = le16_to_cpu(val);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}
uint32_t ide_data_readw(void *opaque, uint32_t addr)
@@ -1933,8 +1935,10 @@ uint32_t ide_data_readw(void *opaque, ui
ret = cpu_to_le16(*(uint16_t *)p);
p += 2;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}
@@ -1958,8 +1962,10 @@ void ide_data_writel(void *opaque, uint3
*(uint32_t *)p = le32_to_cpu(val);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
}
uint32_t ide_data_readl(void *opaque, uint32_t addr)
@@ -1983,8 +1989,10 @@ uint32_t ide_data_readl(void *opaque, ui
ret = cpu_to_le32(*(uint32_t *)p);
p += 4;
s->data_ptr = p;
- if (p >= s->data_end)
+ if (p >= s->data_end) {
+ s->status &= ~DRQ_STAT;
s->end_transfer_func(s);
+ }
return ret;
}

View File

@ -1,25 +0,0 @@
From aa851d30acfbb9580098ac1dc82885530cb8b3c1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 3 Jun 2015 14:17:46 +0200
Subject: [PATCH 2/3] ide/atapi: Fix START STOP UNIT command completion
The command must be completed on all code paths. START STOP UNIT with
pwrcnd set should succeed without doing anything.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/ide/atapi.c | 1 +
1 file changed, 1 insertion(+)
Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/atapi.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/hw/ide/atapi.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/hw/ide/atapi.c
@@ -879,6 +879,7 @@ static void cmd_start_stop_unit(IDEState
if (pwrcnd) {
/* eject/load only happens for power condition == 0 */
+ ide_atapi_cmd_ok(s);
return;
}

View File

@ -1,50 +0,0 @@
References: bsc#944463
Subject: ui/vnc: limit client_cut_text msg payload size
From: Peter Lieven pl@kamp.de Mon Jun 30 10:07:54 2014 +0200
Date: Tue Jul 1 13:26:40 2014 +0200:
Git: f9a70e79391f6d7c2a912d785239ee8effc1922d
Currently a malicious client could define a payload
size of 2^32 - 1 bytes and send up to that much
data to the VNC server. The server would allocate
that amount of memory, which could easily create an
out-of-memory condition.
This patch limits the payload size to 1MB max.
Please note that client_cut_text messages are currently
silently ignored.
Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/vnc.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/vnc.c
+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/vnc.c
@@ -1779,14 +1779,21 @@ static int protocol_client_msg(VncState
pointer_event(vs, read_u8(data, 1), read_u16(data, 2), read_u16(data, 4));
break;
case 6:
- if (len == 1)
+ if (len == 1) {
return 8;
-
+ }
if (len == 8) {
uint32_t v;
v = read_u32(data, 4);
- if (v)
+ if (v > (1 << 20)) {
+ VNC_DEBUG("vnc: client_cut_text msg payload has %u bytes"
+ " which exceeds our limit of 1MB.", v);
+ vnc_client_error(vs);
+ break;
+ }
+ if (v > 0) {
return 8 + v;
+ }
}
client_cut_text(vs, read_u32(data, 4), (char *)(data + 8));
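The defence above is an instance of the general rule of validating an attacker-controlled length before allocating. A small standalone sketch of parsing a length-prefixed message with a hard cap; the limit and function names are illustrative only:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_PAYLOAD (1u << 20)   /* 1 MiB, mirroring the patch above */

    /* Parse a message that starts with a 4-byte big-endian payload length.
     * Returns a malloc'd copy of the payload, or NULL if the length is
     * implausible (instead of letting the client pick our allocation size). */
    static uint8_t *read_payload(const uint8_t *msg, size_t msg_len,
                                 uint32_t *out_len)
    {
        uint32_t dlen;

        if (msg_len < 4)
            return NULL;
        dlen = ((uint32_t)msg[0] << 24) | ((uint32_t)msg[1] << 16) |
               ((uint32_t)msg[2] << 8)  |  (uint32_t)msg[3];

        if (dlen > MAX_PAYLOAD || dlen > msg_len - 4)
            return NULL;              /* reject before allocating anything */

        uint8_t *buf = malloc(dlen ? dlen : 1);
        if (!buf)
            return NULL;
        memcpy(buf, msg + 4, dlen);
        *out_len = dlen;
        return buf;
    }

    int main(void)
    {
        /* Claims a ~4 GiB payload while actually sending 4 bytes of it. */
        const uint8_t evil[] = { 0xff, 0xff, 0xff, 0xff, 'a', 'b', 'c', 'd' };
        uint32_t len = 0;
        uint8_t *p = read_payload(evil, sizeof(evil), &len);

        printf("%s\n", p ? "accepted" : "rejected oversized payload");
        free(p);
        return 0;
    }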

View File

@ -1,49 +0,0 @@
References: bsc#944463
Subject: ui/vnc: limit client_cut_text msg payload size
From: Peter Lieven pl@kamp.de Mon Jun 30 10:07:54 2014 +0200
Date: Tue Jul 1 13:26:40 2014 +0200:
Git: f9a70e79391f6d7c2a912d785239ee8effc1922d
Currently a malicious client could define a payload
size of 2^32 - 1 bytes and send up to that much
data to the VNC server. The server would allocate
that amount of memory, which could easily create an
out-of-memory condition.
This patch limits the payload size to 1MB max.
Please note that client_cut_text messages are currently
silently ignored.
Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Index: xen-4.5.1-testing/tools/qemu-xen-dir-remote/ui/vnc.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-dir-remote/ui/vnc.c
+++ xen-4.5.1-testing/tools/qemu-xen-dir-remote/ui/vnc.c
@@ -2149,13 +2149,20 @@ static int protocol_client_msg(VncState
pointer_event(vs, read_u8(data, 1), read_u16(data, 2), read_u16(data, 4));
break;
case VNC_MSG_CLIENT_CUT_TEXT:
- if (len == 1)
+ if (len == 1) {
return 8;
-
+ }
if (len == 8) {
uint32_t dlen = read_u32(data, 4);
- if (dlen > 0)
+ if (dlen > (1 << 20)) {
+ error_report("vnc: client_cut_text msg payload has %u bytes"
+ " which exceeds our limit of 1MB.", dlen);
+ vnc_client_error(vs);
+ break;
+ }
+ if (dlen > 0) {
return 8 + dlen;
+ }
}
client_cut_text(vs, read_u32(data, 4), data + 8);

View File

@ -1,31 +0,0 @@
References: bsc#944697
From: P J P <address@hidden>
While processing transmit descriptors, this could lead to an infinite
loop if 'bytes' were to become zero; add a check to avoid it.
[The guest can force 'bytes' to 0 by setting the hdr_len and mss
descriptor fields to 0.
--Stefan]
Signed-off-by: P J P <address@hidden>
Signed-off-by: Stefan Hajnoczi <address@hidden>
---
hw/net/e1000.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
Index: xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
===================================================================
--- xen-4.5.1-testing.orig/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
+++ xen-4.5.1-testing/tools/qemu-xen-traditional-dir-remote/hw/e1000.c
@@ -470,7 +470,8 @@ process_tx_desc(E1000State *s, struct e1
memmove(tp->data, tp->header, hdr);
tp->size = hdr;
}
- } while (split_size -= bytes);
+ split_size -= bytes;
+ } while (bytes && split_size);
} else if (!tp->tse && tp->cptse) {
// context descriptor TSE is not set, while data descriptor TSE is set
DBGOUT(TXERR, "TCP segmentaion Error\n");
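The hang described above is the classic case of a loop condition driven entirely by guest-supplied values: if the per-iteration step can be zero, the loop never terminates. A minimal reproduction of the guard, with values invented for the example (this is not the e1000 state machine):

    #include <stdio.h>

    /* Consume 'split_size' bytes, 'bytes' per iteration.  'bytes' is derived
     * from guest-programmed descriptor fields (hdr_len/mss in the e1000
     * case), so a malicious guest can force it to 0. */
    static unsigned long consume(unsigned long split_size, unsigned long bytes)
    {
        unsigned long iterations = 0;

        do {
            /* ... process one chunk here ... */
            iterations++;
            split_size -= bytes;
        } while (bytes && split_size);  /* the extra 'bytes &&' is the fix; the
                                         * original "while (split_size -= bytes)"
                                         * spins forever once bytes == 0 */

        return iterations;
    }

    int main(void)
    {
        printf("normal:  %lu iteration(s)\n", consume(4096, 1024)); /* 4 */
        printf("hostile: %lu iteration(s)\n", consume(4096, 0));    /* 1, not infinite */
        return 0;
    }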

Some files were not shown because too many files have changed in this diff