Index: 2007-05-14/xen/arch/x86/traps.c
===================================================================
--- 2007-05-14.orig/xen/arch/x86/traps.c 2007-05-14 14:39:42.000000000 +0200
+++ 2007-05-14/xen/arch/x86/traps.c 2007-05-14 14:40:03.000000000 +0200
@@ -1091,6 +1091,63 @@ static int read_descriptor(unsigned int
     return 1;
 }
 
+#ifdef CONFIG_COMPAT/* XXX __x86_64__ */
+static int read_gate_descriptor(unsigned int gate_sel,
+                                const struct vcpu *v,
+                                unsigned int *sel,
+                                unsigned long *off,
+                                unsigned int *ar)
+{
+    struct desc_struct desc;
+    const struct desc_struct *pdesc;
+    /* Decode gate 'gate_sel' of 'v' into *sel, *off and *ar.  Returns 1 on */
+    /* success, 0 if the selector is out of range or the gate is unusable. */
+    pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
+                                         GDT_VIRT_START(v) :
+                                         LDT_VIRT_START(v))
+            + (gate_sel >> 3);
+    if ( gate_sel < 4 ||
+         (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
+         __get_user(desc, pdesc) )
+        return 0;
+    /* Split the entry into target selector, 32-bit offset and access rights. */
+    *sel = (desc.a >> 16) & 0x0000fffc;
+    *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
+    *ar = desc.b & 0x0000ffff;
+    /*
+     * check_descriptor() clears the DPL field and stores the
+     * guest requested DPL in the selector's RPL field.
+     */
+    ASSERT(!(*ar & _SEGMENT_DPL));
+    *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+    /* Not a 32-on-64 vcpu: the next slot supplies offset bits 63:32. */
+    if ( !is_pv_32on64_vcpu(v) )
+    {
+        if ( (*ar & 0x1f00) != 0x0c00 ||
+             (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+             __get_user(desc, pdesc + 1) ||
+             (desc.b & 0x1f00) )
+            return 0;
+
+        *off |= (unsigned long)desc.a << 32;
+        return 1;
+    }
+    /* 32-on-64 vcpu: type 0x4 keeps only a 16-bit offset, 0xc all 32 bits. */
+    switch ( *ar & 0x1f00 )
+    {
+    case 0x0400:
+        *off &= 0xffff;
+        break;
+    case 0x0c00:
+        break;
+    default:
+        return 0;
+    }
+
+    return 1;
+}
+#endif
+
 /* Has the guest requested sufficient permission for this I/O access?
*/
 static inline int guest_io_okay(
     unsigned int port, unsigned int bytes,
@@ -1158,6 +1215,8 @@ unsigned long guest_to_host_gpr_switch(u
 #define insn_fetch(type, base, eip, limit)                                  \
 ({  unsigned long _rc, _ptr = (base) + (eip);                               \
     type _x;                                                                \
+    if ( ad_default < 8 )                                                   \
+        _ptr = (unsigned int)_ptr;                                          \
     if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
         goto fail;                                                          \
     if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
@@ -1760,6 +1819,358 @@ static int emulate_privileged_op(struct
     return 0;
 }
 
+/*
+ * Check that 'decr' bytes can be pushed below 'esp' within a stack segment
+ * with access rights 'ar' and limit 'limit'.  All arithmetic is unsigned
+ * 32-bit.  The new stack pointer (esp - decr) must be below esp - 1, and
+ * the limit is applied to esp - 1 for a normal segment or to esp - decr
+ * when _SEGMENT_EC (expand-down) is set.  Returns non-zero if acceptable.
+ */
+static inline int check_stack_limit(unsigned int ar, unsigned int limit,
+                                    unsigned int esp, unsigned int decr)
+{
+    return esp - decr < esp - 1 &&
+           (!(ar & _SEGMENT_EC) ? esp - 1 <= limit : esp - decr > limit);
+}
+
+/*
+ * Emulate a far CALL or JMP through a call gate for a guest that cannot use
+ * the gate directly.  regs->error_code holds the gate selector of the #GP
+ * being handled.  The faulting instruction is decoded to find the far
+ * pointer operand, the gate and its target code segment are validated, and
+ * for a CALL the return CS:EIP (and, on a privilege change, the outer SS:ESP
+ * and any gate parameters) is pushed onto the guest stack.  Guest-visible
+ * faults are delivered through do_guest_trap() and its result is returned;
+ * otherwise the function returns 0, including when CONFIG_COMPAT is unset.
+ */
+static int emulate_gate_op(struct cpu_user_regs *regs)
+{
+#ifdef CONFIG_COMPAT/* XXX __x86_64__ */
+    struct vcpu *v = current;
+    unsigned int sel, ar, dpl, nparm, opnd_sel;
+    unsigned int op_default, op_bytes, ad_default, ad_bytes;
+    unsigned long off, eip, opnd_off, base, limit;
+    int jump;
+
+    /* Check whether this fault is due to the use of a call gate. */
+    if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
+         ((ar >> 13) & 3) < (regs->cs & 3) ||
+         (ar & _SEGMENT_TYPE) != 0xc00 )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    if ( !(ar & _SEGMENT_P) )
+        return do_guest_trap(TRAP_no_segment, regs, 1);
+    dpl = (ar >> 13) & 3;
+    nparm = ar & 0x1f;
+
+    /*
+     * Decode instruction (and perhaps operand) to determine RPL,
+     * whether this is a jump or a call, and the call return offset.
+     */
+    if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
+    ad_default = ad_bytes = op_default;
+    opnd_sel = opnd_off = 0;
+    jump = -1;
+    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    {
+        switch ( insn_fetch(u8, base, eip, limit) )
+        {
+        case 0x66: /* operand-size override */
+            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
+            continue;
+        case 0x67: /* address-size override */
+            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
+            continue;
+        case 0x2e: /* CS override */
+            opnd_sel = regs->cs;
+            ASSERT(opnd_sel);
+            continue;
+        case 0x3e: /* DS override */
+            opnd_sel = read_sreg(regs, ds);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x26: /* ES override */
+            opnd_sel = read_sreg(regs, es);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x64: /* FS override */
+            opnd_sel = read_sreg(regs, fs);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x65: /* GS override */
+            opnd_sel = read_sreg(regs, gs);
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0x36: /* SS override */
+            opnd_sel = regs->ss;
+            if ( !opnd_sel )
+                opnd_sel = dpl;
+            continue;
+        case 0xea:
+            ++jump;
+            /* FALLTHROUGH */
+        case 0x9a:
+            ++jump;
+            opnd_sel = regs->cs;
+            opnd_off = eip;
+            ad_bytes = ad_default;
+            eip += op_bytes + 2;
+            break;
+        case 0xff:
+            {
+                unsigned int modrm;
+
+                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
+                {
+                case 0x28: case 0x68: case 0xa8:
+                    ++jump;
+                    /* FALLTHROUGH */
+                case 0x18: case 0x58: case 0x98:
+                    ++jump;
+                    if ( ad_bytes != 2 )
+                    {
+                        if ( (modrm & 7) == 4 )
+                        {
+                            unsigned int sib = insn_fetch(u8, base, eip, limit);
+
+                            modrm = (modrm & ~7) | (sib & 7);
+                            /* Mask off the scale bits: index 4 means none. */
+                            if ( ((sib >>= 3) & 7) != 4 )
+                                opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
+                            opnd_off <<= sib >> 3;
+                        }
+                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
+                            opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
+                        else
+                            modrm |= 0x87;
+                        if ( !opnd_sel )
+                        {
+                            switch ( modrm & 7 )
+                            {
+                            default:
+                                opnd_sel = read_sreg(regs, ds);
+                                break;
+                            case 4: case 5:
+                                opnd_sel = regs->ss;
+                                break;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        switch ( modrm & 7 )
+                        {
+                        case 0: case 1: case 7:
+                            opnd_off = regs->ebx;
+                            break;
+                        case 6:
+                            if ( !(modrm & 0xc0) )
+                                modrm |= 0x80;
+                            else
+                        case 2: case 3:
+                            {
+                                opnd_off = regs->ebp;
+                                if ( !opnd_sel )
+                                    opnd_sel = regs->ss;
+                            }
+                            break;
+                        }
+                        if ( !opnd_sel )
+                            opnd_sel = read_sreg(regs, ds);
+                        switch ( modrm & 7 )
+                        {
+                        case 0: case 2: case 4:
+                            opnd_off += regs->esi;
+                            break;
+                        case 1: case 3: case 5:
+                            opnd_off += regs->edi;
+                            break;
+                        }
+                    }
+                    switch ( modrm & 0xc0 )
+                    {
+                    case 0x40:
+                        opnd_off += insn_fetch(s8, base, eip, limit);
+                        break;
+                    case 0x80:
+                        /* 16-bit addressing encodes mod == 2 as disp16. */
+                        if ( ad_bytes > 2 )
+                            opnd_off += insn_fetch(s32, base, eip, limit);
+                        else
+                            opnd_off += insn_fetch(s16, base, eip, limit);
+                        break;
+                    }
+                    if ( ad_bytes == 4 )
+                        opnd_off = (unsigned int)opnd_off;
+                    else if ( ad_bytes == 2 )
+                        opnd_off = (unsigned short)opnd_off;
+                    break;
+                }
+            }
+            break;
+        }
+        break;
+    }
+
+    if ( jump < 0 )
+    {
+ fail:
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+
+    if ( (opnd_sel != regs->cs &&
+          !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    opnd_off += op_bytes;
+#define ad_default ad_bytes
+    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
+#undef ad_default
+    ASSERT((opnd_sel & ~3) == regs->error_code);
+    if ( dpl < (opnd_sel & 3) )
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+    if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_CODE) ||
+         (!jump || (ar & _SEGMENT_EC) ?
+          ((ar >> 13) & 3) > (regs->cs & 3) :
+          ((ar >> 13) & 3) != (regs->cs & 3)) )
+    {
+        regs->error_code = sel;
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+    if ( !(ar & _SEGMENT_P) )
+    {
+        regs->error_code = sel;
+        return do_guest_trap(TRAP_no_segment, regs, 1);
+    }
+    if ( off > limit )
+    {
+        regs->error_code = 0;
+        return do_guest_trap(TRAP_gp_fault, regs, 1);
+    }
+
+    if ( !jump )
+    {
+        unsigned int ss, esp, *stkp;
+        int rc;
+#define push(item) do \
+        { \
+            --stkp; \
+            esp -= 4; \
+            rc = __put_user(item, stkp); \
+            if ( rc ) \
+            { \
+                propagate_page_fault((unsigned long)(stkp + 1) - rc, \
+                                     PFEC_write_access); \
+                return 0; \
+            } \
+        } while ( 0 )
+
+        if ( ((ar >> 13) & 3) < (regs->cs & 3) )
+        {
+            sel |= (ar >> 13) & 3;
+            /* Inner stack known only for kernel ring. */
+            if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            esp = v->arch.guest_context.kernel_sp;
+            ss = v->arch.guest_context.kernel_ss;
+            if ( (ss & 3) != (sel & 3) ||
+                 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) ||
+                 !(ar & _SEGMENT_S) ||
+                 (ar & _SEGMENT_CODE) ||
+                 !(ar & _SEGMENT_WR) )
+            {
+                regs->error_code = ss & ~3;
+                return do_guest_trap(TRAP_invalid_tss, regs, 1);
+            }
+            if ( !(ar & _SEGMENT_P) ||
+                 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
+            {
+                regs->error_code = ss & ~3;
+                return do_guest_trap(TRAP_stack_error, regs, 1);
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            push(regs->ss);
+            push(regs->esp);
+            if ( nparm )
+            {
+                const unsigned int *ustkp;
+
+                if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
+                     ((ar >> 13) & 3) != (regs->cs & 3) ||
+                     !(ar & _SEGMENT_S) ||
+                     (ar & _SEGMENT_CODE) ||
+                     !(ar & _SEGMENT_WR) ||
+                     !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
+                    return do_guest_trap(TRAP_gp_fault, regs, 1);
+                ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
+                if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
+                    return do_guest_trap(TRAP_gp_fault, regs, 1);
+                do
+                {
+                    unsigned int parm;
+
+                    --ustkp;
+                    rc = __get_user(parm, ustkp);
+                    if ( rc )
+                    {
+                        propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
+                        return 0;
+                    }
+                    push(parm);
+                } while ( --nparm );
+            }
+        }
+        else
+        {
+            sel |= (regs->cs & 3);
+            esp = regs->esp;
+            ss = regs->ss;
+            if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+            if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
+            {
+                regs->error_code = 0;
+                return do_guest_trap(TRAP_stack_error, regs, 1);
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 2, 2 * 4) )
+                return do_guest_trap(TRAP_gp_fault, regs, 1);
+        }
+        push(regs->cs);
+        push(eip);
+#undef push
+        regs->esp = esp;
+        regs->ss = ss;
+    }
+    else
+        sel |= (regs->cs & 3);
+
+    regs->eip = off;
+    regs->cs = sel;
+#endif
+
+    return 0;
+}
+
 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -1805,6 +2216,8 @@ asmlinkage int do_general_protection(str
             return do_guest_trap(vector, regs, 0);
         }
     }
+    else if ( is_pv_32on64_vcpu(v) && regs->error_code )
+        return emulate_gate_op(regs);
 
     /* Emulate some simple privileged and I/O instructions. */
     if ( (regs->error_code == 0) &&
Index: 2007-05-14/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2007-05-14.orig/xen/arch/x86/x86_64/mm.c 2007-05-03 09:45:09.000000000 +0200
+++ 2007-05-14/xen/arch/x86/x86_64/mm.c 2007-05-14 14:40:03.000000000 +0200
@@ -372,14 +372,16 @@ int check_descriptor(const struct domain
 {
     u32 a = d->a, b = d->b;
     u16 cs;
+    unsigned int dpl;
 
     /* A not-present descriptor will always fault, so is safe. */
     if ( !(b & _SEGMENT_P) )
         goto good;
 
     /* Check and fix up the DPL.
*/ - if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) ) - d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13); + dpl = (b >> 13) & 3; + __fixup_guest_selector(dom, dpl); + b = (b & ~_SEGMENT_DPL) | (dpl << 13); /* All code and data segments are okay. No base/limit checking. */ if ( (b & _SEGMENT_S) ) @@ -397,18 +399,33 @@ int check_descriptor(const struct domain if ( (b & _SEGMENT_TYPE) != 0xc00 ) goto bad; - /* Validate and fix up the target code selector. */ + /* Validate the target code selector. */ cs = a >> 16; - fixup_guest_code_selector(dom, cs); if ( !guest_gate_selector_okay(dom, cs) ) goto bad; - a = d->a = (d->a & 0xffffU) | (cs << 16); +#ifdef __x86_64__ + /* + * Force DPL to zero, causing a GP fault with its error code indicating + * the gate in use, allowing emulation. This is necessary because with + * native guests (kernel in ring 3) call gates cannot be used directly + * to transition from user to kernel mode (and whether a gate is used + * to enter the kernel can only be determined when the gate is being + * used), and with compat guests call gates cannot be used at all as + * there are only 64-bit ones. + * Store the original DPL in the selector's RPL field. + */ + b &= ~_SEGMENT_DPL; + cs = (cs & ~3) | dpl; +#endif + a = (a & 0xffffU) | (cs << 16); /* Reserved bits must be zero. */ - if ( (b & 0xe0) != 0 ) + if ( b & (CONFIG_PAGING_LEVELS < 4 || is_pv_32on64_domain(dom) ? 0xe0 : 0xff) ) goto bad; good: + d->a = a; + d->b = b; return 1; bad: return 0;