--- a/xen/arch/ia64/linux-xen/smp.c
+++ b/xen/arch/ia64/linux-xen/smp.c
@@ -94,6 +94,7 @@ static volatile struct call_data_struct 
 
 #define IPI_CALL_FUNC		0
 #define IPI_CPU_STOP		1
+#define IPI_STATE_DUMP		2	/* handle_IPI() calls dump_execstate() */
 
 /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
 static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
@@ -202,6 +203,10 @@ handle_IPI (int irq, void *dev_id, struc
 				stop_this_cpu();
 				break;
 
+			      case IPI_STATE_DUMP:
+				dump_execstate(regs);
+				break;
+
 			      default:
 				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
 				break;
@@ -479,6 +484,12 @@ smp_send_stop (void)
 	send_IPI_allbutself(IPI_CPU_STOP);
 }
 
+void
+smp_send_state_dump (unsigned int cpu)
+{
+	send_IPI_single(cpu, IPI_STATE_DUMP);	/* serviced in handle_IPI() */
+}
+
 int __init
 setup_profiling_timer (unsigned int multiplier)
 {
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -375,11 +375,24 @@ void smp_send_nmi_allbutself(void)
     send_IPI_mask(&cpu_online_map, APIC_DM_NMI);
 }
 
+void smp_send_state_dump(unsigned int cpu)
+{
+    state_dump_pending(cpu) = 1; /* Tested in smp_event_check_interrupt(). */
+    smp_send_event_check_cpu(cpu); /* Kick @cpu with an event-check IPI. */
+}
+
 fastcall void smp_event_check_interrupt(struct cpu_user_regs *regs)
 {
     struct cpu_user_regs *old_regs = set_irq_regs(regs);
     ack_APIC_irq();
     perfc_incr(ipis);
+    if ( unlikely(state_dump_pending(smp_processor_id())) )
+    {
+        irq_enter();   /* Dump was requested via smp_send_state_dump(). */
+        state_dump_pending(smp_processor_id()) = 0; /* Consume the request. */
+        dump_execstate(regs);
+        irq_exit();
+    }
     set_irq_regs(old_regs);
 }
 
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -71,19 +71,52 @@ static struct keyhandler show_handlers_k
     .desc = "show this message"
 };
 
-static void __dump_execstate(void *unused)
+#ifdef CONFIG_SMP
+/* CPUs asked to dump their state that have not finished doing so. */
+static cpumask_t dump_execstate_mask;
+#endif
+
+/* Dump this CPU's host state (unless regs is guest mode) and guest state. */
+void dump_execstate(struct cpu_user_regs *regs)
 {
-    dump_execution_state();
-    printk("*** Dumping CPU%d guest state: ***\n", smp_processor_id());
+    unsigned int cpu = smp_processor_id();
+
+    if ( !guest_mode(regs) )
+    {
+        printk("\n*** Dumping CPU%u host state: ***\n", cpu);
+        show_execution_state(regs);
+    }
-    if ( is_idle_vcpu(current) )
-        printk("No guest context (CPU is idle).\n");
-    else
+    if ( !is_idle_vcpu(current) )
+    {
+        printk("*** Dumping CPU%u guest state (d%d:v%d): ***\n", cpu,
+               current->domain->domain_id, current->vcpu_id);
         show_execution_state(guest_cpu_user_regs());
+    }
+    else
+        printk("No guest context (CPU%u is idle).\n", cpu);
+
+#ifdef CONFIG_SMP
+    cpu_clear(cpu, dump_execstate_mask); /* This CPU is done. */
+    if ( !alt_key_handling )
+        return;
+
+    cpu = cycle_cpu(cpu, dump_execstate_mask); /* Next CPU left, if any. */
+    if ( cpu < NR_CPUS )
+    {
+        smp_send_state_dump(cpu);
+        return;
+    }
+    printk("\n");
+    console_end_sync();
+    watchdog_enable();
+#endif
 }
 
 static void dump_registers(unsigned char key, struct cpu_user_regs *regs)
 {
+#ifdef CONFIG_SMP
     unsigned int cpu;
+#endif
 
     /* We want to get everything out that we possibly can. */
     watchdog_disable();
@@ -91,17 +124,28 @@ static void dump_registers(unsigned char
 
     printk("'%c' pressed -> dumping registers\n", key);
 
+#ifdef CONFIG_SMP
+    if ( alt_key_handling )
+        dump_execstate_mask = cpu_online_map;
+#endif
+
     /* Get local execution state out immediately, in case we get stuck. */
-    printk("\n*** Dumping CPU%d host state: ***\n", smp_processor_id());
-    __dump_execstate(NULL);
+    dump_execstate(regs);
+
+#ifdef CONFIG_SMP
+    if ( alt_key_handling )
+        return;
 
     for_each_online_cpu ( cpu )
     {
         if ( cpu == smp_processor_id() )
             continue;
-        printk("\n*** Dumping CPU%d host state: ***\n", cpu);
-        on_selected_cpus(cpumask_of(cpu), __dump_execstate, NULL, 1);
+        cpu_set(cpu, dump_execstate_mask);
+        smp_send_state_dump(cpu);
+        while ( cpu_isset(cpu, dump_execstate_mask) )
+            cpu_relax();
     }
+#endif
 
     printk("\n");
 
--- a/xen/include/asm-ia64/linux-xen/asm/ptrace.h
+++ b/xen/include/asm-ia64/linux-xen/asm/ptrace.h
@@ -278,7 +278,7 @@ struct switch_stack {
 # define ia64_task_regs(t)		(((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1)
 # define ia64_psr(regs)			((struct ia64_psr *) &(regs)->cr_ipsr)
 #ifdef XEN
-# define guest_mode(regs)		(ia64_psr(regs)->cpl != 0)
+# define guest_mode(regs)		(ia64_psr(regs)->cpl && !ia64_psr(regs)->vm)
 # define guest_kernel_mode(regs)	(ia64_psr(regs)->cpl == CONFIG_CPL0_EMUL)
 # define vmx_guest_kernel_mode(regs)	(ia64_psr(regs)->cpl == 0)
 # define regs_increment_iip(regs)					\
--- a/xen/include/asm-x86/hardirq.h
+++ b/xen/include/asm-x86/hardirq.h
@@ -8,6 +8,7 @@ typedef struct {
 	unsigned long __softirq_pending;
 	unsigned int __local_irq_count;
 	unsigned int __nmi_count;
+	bool_t __state_dump_pending;
 } __cacheline_aligned irq_cpustat_t;
 
 #include <xen/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
--- a/xen/include/xen/irq_cpustat.h
+++ b/xen/include/xen/irq_cpustat.h
@@ -26,5 +26,6 @@ extern irq_cpustat_t irq_stat[];
 #define softirq_pending(cpu)	__IRQ_STAT((cpu), __softirq_pending)
 #define local_irq_count(cpu)	__IRQ_STAT((cpu), __local_irq_count)
 #define nmi_count(cpu)		__IRQ_STAT((cpu), __nmi_count)
+#define state_dump_pending(cpu)	__IRQ_STAT((cpu), __state_dump_pending)
 
 #endif	/* __irq_cpustat_h */
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -111,4 +111,7 @@ extern int tainted;
 extern char *print_tainted(char *str);
 extern void add_taint(unsigned);
 
+struct cpu_user_regs;
+void dump_execstate(struct cpu_user_regs *); /* common/keyhandler.c */
+
 #endif /* __LIB_H__ */
--- a/xen/include/xen/smp.h
+++ b/xen/include/xen/smp.h
@@ -13,6 +13,8 @@ extern void smp_send_event_check_mask(co
 #define smp_send_event_check_cpu(cpu) \
     smp_send_event_check_mask(cpumask_of(cpu))
 
+extern void smp_send_state_dump(unsigned int cpu); /* per-arch smp.c */
+
 /*
  * Prepare machine for booting other CPUs.
  */