xen/32on64-call-gates.patch

485 lines
17 KiB
Diff

Index: 2007-05-14/xen/arch/x86/traps.c
===================================================================
--- 2007-05-14.orig/xen/arch/x86/traps.c 2007-05-14 14:39:42.000000000 +0200
+++ 2007-05-14/xen/arch/x86/traps.c 2007-05-14 14:40:03.000000000 +0200
@@ -1091,6 +1091,63 @@ static int read_descriptor(unsigned int
return 1;
}
+#ifdef CONFIG_COMPAT/* XXX __x86_64__ */
+static int read_gate_descriptor(unsigned int gate_sel, /* selector naming the gate */
+ const struct vcpu *v, /* vCPU whose GDT/LDT is consulted */
+ unsigned int *sel, /* out: target code selector, low two bits cleared */
+ unsigned long *off, /* out: target offset */
+ unsigned int *ar) /* out: low 16 bits of desc.b, DPL restored */
+{
+ struct desc_struct desc;
+ const struct desc_struct *pdesc;
+
+ /* Fetch and decode a gate descriptor; returns 1 if usable, 0 otherwise (outputs may be partly written). */
+ pdesc = (const struct desc_struct *)(!(gate_sel & 4) ? /* bit 2 clear: GDT, set: LDT */
+ GDT_VIRT_START(v) :
+ LDT_VIRT_START(v))
+ + (gate_sel >> 3);
+ if ( gate_sel < 4 || /* selectors 0-3 are rejected outright */
+ (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) || /* GDT entry at/above the reserved boundary */
+ __get_user(desc, pdesc) ) /* non-zero result: descriptor not readable */
+ return 0;
+
+ *sel = (desc.a >> 16) & 0x0000fffc; /* selector field without its low two (RPL) bits */
+ *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000); /* offset 15:0 from desc.a, 31:16 from desc.b */
+ *ar = desc.b & 0x0000ffff;
+ /*
+ * check_descriptor() clears the DPL field and stores the guest requested
+ * DPL in the selector's RPL field; copy it back into *ar's DPL bits here.
+ */
+ ASSERT(!(*ar & _SEGMENT_DPL));
+ *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+
+ if ( !is_pv_32on64_vcpu(v) ) /* not a 32-on-64 vCPU: the gate spans two consecutive slots */
+ {
+ if ( (*ar & 0x1f00) != 0x0c00 || /* only type 0xc (with bit 12 clear) is accepted */
+ (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || /* second slot must stay below the boundary too */
+ __get_user(desc, pdesc + 1) || /* read the second slot */
+ (desc.b & 0x1f00) ) /* same bits of the second slot must be zero */
+ return 0;
+
+ *off |= (unsigned long)desc.a << 32; /* offset bits 63:32 */
+ return 1;
+ }
+
+ switch ( *ar & 0x1f00 ) /* 32-on-64 vCPU: single-slot gate */
+ {
+ case 0x0400: /* type 0x4: only the low 16 offset bits are kept */
+ *off &= 0xffff;
+ break;
+ case 0x0c00: /* type 0xc: offset used as assembled above */
+ break;
+ default: /* any other type is not handled here */
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
/* Has the guest requested sufficient permission for this I/O access? */
static inline int guest_io_okay(
unsigned int port, unsigned int bytes,
@@ -1158,6 +1215,8 @@ unsigned long guest_to_host_gpr_switch(u
#define insn_fetch(type, base, eip, limit) \
({ unsigned long _rc, _ptr = (base) + (eip); \
type _x; \
+ if ( ad_default < 8 ) \
+ _ptr = (unsigned int)_ptr; \
if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
goto fail; \
if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
@@ -1760,6 +1819,336 @@ static int emulate_privileged_op(struct
return 0;
}
+/* 1 iff [esp - decr, esp - 1] doesn't wrap and is valid for limit (_SEGMENT_EC aware). */
+static inline int check_stack_limit(unsigned int ar, unsigned int limit,
+                                    unsigned int esp, unsigned int decr)
+{
+    return esp - decr < esp - 1 && ((ar & _SEGMENT_EC) ? esp - decr > limit : esp - 1 <= limit);
+}
+
+/* Emulate the far CALL/JMP through a call gate that raised the current #GP. */
+static int emulate_gate_op(struct cpu_user_regs *regs)
+{
+#ifdef CONFIG_COMPAT/* XXX __x86_64__ */
+    struct vcpu *v = current;
+    unsigned int sel, ar, dpl, nparm, opnd_sel;
+    unsigned int op_default, op_bytes, ad_default, ad_bytes;
+    unsigned long off, eip, opnd_off, base, limit;
+    int jump; /* -1: nothing decoded yet, 0: far CALL, 1: far JMP */
+
+    /* Check whether this fault is due to the use of a call gate. */
+    if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
+         ((ar >> 13) & 3) < (regs->cs & 3) || /* gate DPL below CPL */
+         (ar & _SEGMENT_TYPE) != 0xc00 )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    if ( !(ar & _SEGMENT_P) )
+        return do_guest_trap(TRAP_no_segment, regs, 1);
+    dpl = (ar >> 13) & 3;
+    nparm = ar & 0x1f; /* number of 4-byte stack parameters to copy */
+
+    /*
+     * Decode instruction (and perhaps operand) to determine RPL,
+     * whether this is a jump or a call, and the call return offset.
+     */
+    if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
+    ad_default = ad_bytes = op_default;
+    opnd_sel = opnd_off = 0;
+    jump = -1;
+    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    {
+        switch ( insn_fetch(u8, base, eip, limit) )
+        {
+        case 0x66: /* operand-size override */
+            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
+            continue;
+        case 0x67: /* address-size override */
+            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
+            continue;
+        case 0x2e: /* CS override */
+            opnd_sel = regs->cs;
+            ASSERT(opnd_sel);
+            continue;
+        case 0x3e: /* DS override */
+            opnd_sel = read_sreg(regs, ds);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x26: /* ES override */
+            opnd_sel = read_sreg(regs, es);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x64: /* FS override */
+            opnd_sel = read_sreg(regs, fs);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x65: /* GS override */
+            opnd_sel = read_sreg(regs, gs);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x36: /* SS override */
+            opnd_sel = regs->ss;
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0xea: /* JMP far, pointer encoded in the instruction */
+            ++jump;
+            /* FALLTHROUGH */
+        case 0x9a: /* CALL far, pointer encoded in the instruction */
+            ++jump;
+            opnd_sel = regs->cs;
+            opnd_off = eip; /* the far pointer is read from the code stream */
+            ad_bytes = ad_default;
+            eip += op_bytes + 2;
+            break;
+        case 0xff: /* /3 is CALL far, /5 is JMP far, pointer in memory */
+        {
+            unsigned int modrm;
+            switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
+            {
+            case 0x28: case 0x68: case 0xa8: /* /5 with mod 0, 1, 2 */
+                ++jump;
+                /* FALLTHROUGH */
+            case 0x18: case 0x58: case 0x98: /* /3 with mod 0, 1, 2 */
+                ++jump;
+                if ( ad_bytes != 2 ) /* 32-bit addressing forms */
+                {
+                    if ( (modrm & 7) == 4 )
+                    {
+                        unsigned int sib = insn_fetch(u8, base, eip, limit);
+                        modrm = (modrm & ~7) | (sib & 7);
+                        if ( ((sib >>= 3) & 7) != 4 ) /* index 4 means none, whatever the scale */
+                            opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
+                        opnd_off <<= sib >> 3;
+                    }
+                    if ( (modrm & 7) != 5 || (modrm & 0xc0) )
+                        opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
+                    else
+                        modrm |= 0x87; /* no base: force a disp32 fetch and the DS default */
+                    if ( !opnd_sel )
+                    {
+                        switch ( modrm & 7 )
+                        {
+                        default:
+                            opnd_sel = read_sreg(regs, ds);
+                            break;
+                        case 4: case 5:
+                            opnd_sel = regs->ss;
+                            break;
+                        }
+                    }
+                }
+                else /* 16-bit addressing forms */
+                {
+                    switch ( modrm & 7 )
+                    {
+                    case 0: case 1: case 7:
+                        opnd_off = regs->ebx;
+                        break;
+                    case 6:
+                        if ( !(modrm & 0xc0) )
+                            modrm |= 0x80; /* no base: force a displacement fetch */
+                        else
+                    case 2: case 3:
+                        {
+                            opnd_off = regs->ebp;
+                            if ( !opnd_sel )
+                                opnd_sel = regs->ss;
+                        }
+                        break;
+                    }
+                    if ( !opnd_sel )
+                        opnd_sel = read_sreg(regs, ds);
+                    switch ( modrm & 7 )
+                    {
+                    case 0: case 2: case 4:
+                        opnd_off += regs->esi;
+                        break;
+                    case 1: case 3: case 5:
+                        opnd_off += regs->edi;
+                        break;
+                    }
+                }
+                switch ( modrm & 0xc0 )
+                {
+                case 0x40:
+                    opnd_off += insn_fetch(s8, base, eip, limit);
+                    break;
+                case 0x80: /* disp32, or disp16 when using 16-bit addressing */
+                    opnd_off += ad_bytes > 2 ? insn_fetch(s32, base, eip, limit)
+                                             : insn_fetch(s16, base, eip, limit);
+                    break;
+                }
+                if ( ad_bytes == 4 )
+                    opnd_off = (unsigned int)opnd_off;
+                else if ( ad_bytes == 2 )
+                    opnd_off = (unsigned short)opnd_off;
+                break;
+            }
+        }
+            break;
+        }
+        break;
+    }
+
+    if ( jump < 0 ) /* no far CALL/JMP was decoded */
+    {
+ fail: /* also the target of insn_fetch()'s "goto fail" */
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+
+    if ( (opnd_sel != regs->cs &&
+          !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    opnd_off += op_bytes; /* the selector follows the op_bytes wide offset */
+#define ad_default ad_bytes
+    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
+#undef ad_default
+    /* Guest memory may have changed since the fault: check, don't ASSERT(). */
+    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_CODE) ||
+         (!jump || (ar & _SEGMENT_EC) ?
+          ((ar >> 13) & 3) > (regs->cs & 3) :
+          ((ar >> 13) & 3) != (regs->cs & 3)) )
+    {
+        regs->error_code = sel;
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+    if ( !(ar & _SEGMENT_P) )
+    {
+        regs->error_code = sel;
+        return do_guest_trap(TRAP_no_segment, regs, 1);
+    }
+    if ( off > limit )
+    {
+        regs->error_code = 0;
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+
+    if ( !jump ) /* far CALL: build the return frame */
+    {
+        unsigned int ss, esp, *stkp;
+        int rc;
+#define push(item) do \
+        { \
+            --stkp; \
+            esp -= 4; \
+            rc = __put_user(item, stkp); \
+            if ( rc ) \
+            { \
+                propagate_page_fault((unsigned long)(stkp + 1) - rc, \
+                                     PFEC_write_access); \
+                return 0; \
+            } \
+        } while ( 0 )
+
+        if ( ((ar >> 13) & 3) < (regs->cs & 3) ) /* target DPL below CPL: switch stacks */
+        {
+            sel |= (ar >> 13) & 3;
+            /* Inner stack known only for kernel ring. */
+            if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            esp = v->arch.guest_context.kernel_sp;
+            ss = v->arch.guest_context.kernel_ss;
+            if ( (ss & 3) != (sel & 3) ||
+                 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) ||
+                 !(ar & _SEGMENT_S) ||
+                 (ar & _SEGMENT_CODE) ||
+                 !(ar & _SEGMENT_WR) )
+            {
+                regs->error_code = ss & ~3;
+                return do_guest_trap(TRAP_invalid_tss, regs, 1);
+            }
+            if ( !(ar & _SEGMENT_P) ||
+                 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
+            {
+                regs->error_code = ss & ~3;
+                return do_guest_trap(TRAP_stack_error, regs, 1);
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            push(regs->ss);
+            push(regs->esp);
+            if ( nparm ) /* copy the parameters over from the caller's stack */
+            {
+                const unsigned int *ustkp;
+
+                if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
+                     ((ar >> 13) & 3) != (regs->cs & 3) ||
+                     !(ar & _SEGMENT_S) ||
+                     (ar & _SEGMENT_CODE) ||
+                     !(ar & _SEGMENT_WR) ||
+                     !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
+                    return do_guest_trap(TRAP_gp_fault, regs, 1);
+                ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
+                if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
+                    return do_guest_trap(TRAP_gp_fault, regs, 1);
+                do
+                {
+                    unsigned int parm;
+
+                    --ustkp;
+                    rc = __get_user(parm, ustkp);
+                    if ( rc )
+                    {
+                        propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
+                        return 0;
+                    }
+                    push(parm);
+                } while ( --nparm );
+            }
+        }
+        else /* no privilege change: stay on the current stack */
+        {
+            sel |= (regs->cs & 3);
+            esp = regs->esp;
+            ss = regs->ss;
+            if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
+            {
+                regs->error_code = 0;
+                return do_guest_trap(TRAP_stack_error, regs, 1);
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 2, 2 * 4) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+        }
+        push(regs->cs); /* return address: old CS ... */
+        push(eip); /* ... and the offset just past the decoded instruction */
+#undef push
+        regs->esp = esp;
+        regs->ss = ss;
+    }
+    else /* far JMP: no return frame */
+        sel |= (regs->cs & 3);
+
+    regs->eip = off;
+    regs->cs = sel;
+#endif
+
+    return 0;
+}
+
asmlinkage int do_general_protection(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
@@ -1805,6 +2194,8 @@ asmlinkage int do_general_protection(str
return do_guest_trap(vector, regs, 0);
}
}
+ else if ( is_pv_32on64_vcpu(v) && regs->error_code )
+ return emulate_gate_op(regs);
/* Emulate some simple privileged and I/O instructions. */
if ( (regs->error_code == 0) &&
Index: 2007-05-14/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2007-05-14.orig/xen/arch/x86/x86_64/mm.c 2007-05-03 09:45:09.000000000 +0200
+++ 2007-05-14/xen/arch/x86/x86_64/mm.c 2007-05-14 14:40:03.000000000 +0200
@@ -372,14 +372,16 @@ int check_descriptor(const struct domain
{
u32 a = d->a, b = d->b;
u16 cs;
+ unsigned int dpl;
/* A not-present descriptor will always fault, so is safe. */
if ( !(b & _SEGMENT_P) )
goto good;
/* Check and fix up the DPL. */
- if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) )
- d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13);
+ dpl = (b >> 13) & 3;
+ __fixup_guest_selector(dom, dpl);
+ b = (b & ~_SEGMENT_DPL) | (dpl << 13);
/* All code and data segments are okay. No base/limit checking. */
if ( (b & _SEGMENT_S) )
@@ -397,18 +399,33 @@ int check_descriptor(const struct domain
if ( (b & _SEGMENT_TYPE) != 0xc00 )
goto bad;
- /* Validate and fix up the target code selector. */
+ /* Validate the target code selector. */
cs = a >> 16;
- fixup_guest_code_selector(dom, cs);
if ( !guest_gate_selector_okay(dom, cs) )
goto bad;
- a = d->a = (d->a & 0xffffU) | (cs << 16);
+#ifdef __x86_64__
+ /*
+ * Force DPL to zero, causing a GP fault with its error code indicating
+ * the gate in use, allowing emulation. This is necessary because with
+ * native guests (kernel in ring 3) call gates cannot be used directly
+ * to transition from user to kernel mode (and whether a gate is used
+ * to enter the kernel can only be determined when the gate is being
+ * used), and with compat guests call gates cannot be used at all as
+ * there are only 64-bit ones.
+ * Store the original DPL in the selector's RPL field.
+ */
+ b &= ~_SEGMENT_DPL;
+ cs = (cs & ~3) | dpl;
+#endif
+ a = (a & 0xffffU) | (cs << 16);
/* Reserved bits must be zero. */
- if ( (b & 0xe0) != 0 )
+ if ( b & (CONFIG_PAGING_LEVELS < 4 || is_pv_32on64_domain(dom) ? 0xe0 : 0xff) )
goto bad;
good:
+ d->a = a;
+ d->b = b;
return 1;
bad:
return 0;