From: Juergen Gross <juergen.gross@ts.fujitsu.com>

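This patch introduces cpupools: physical cpus are partitioned into pools,
each pool running its own scheduler instance on a dedicated set of cpus,
and every domain is bound to exactly one pool at any time. cpupool0 is
created at boot and holds dom0. To support this, continue_hypercall_on_cpu()
gains a handle parameter and is reimplemented on top of tasklets instead of
affinity locking, and the credit scheduler is converted to use per-pool
private data.
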
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -234,7 +234,7 @@ static int enter_state(u32 state)
     return error;
 }
 
-static long enter_state_helper(void *data)
+static long enter_state_helper(void *hdl, void *data)
 {
     struct acpi_sleep_info *sinfo = (struct acpi_sleep_info *)data;
     return enter_state(sinfo->sleep_state);
@@ -265,7 +265,7 @@ int acpi_enter_sleep(struct xenpf_enter_
     acpi_sinfo.pm1b_cnt_val = sleep->pm1b_cnt_val;
     acpi_sinfo.sleep_state = sleep->sleep_state;
 
-    return continue_hypercall_on_cpu(0, enter_state_helper, &acpi_sinfo);
+    return continue_hypercall_on_cpu(0, NULL, enter_state_helper, &acpi_sinfo);
 }
 
 static int acpi_get_wake_status(void)
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1522,42 +1522,52 @@ void sync_vcpu_execstate(struct vcpu *v)
 }
 
 struct migrate_info {
-    long (*func)(void *data);
+    struct tasklet tasklet;
+    long (*func)(void *hdl, void *data);
     void *data;
     void (*saved_schedule_tail)(struct vcpu *);
-    cpumask_t saved_affinity;
-    unsigned int nest;
+    volatile int nest;
+    long ret;
+    struct vcpu *v;
 };
 
 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
 {
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct migrate_info *info = v->arch.continue_info;
-    cpumask_t mask = info->saved_affinity;
     void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
 
-    regs->eax = info->func(info->data);
+    regs->eax = info->ret;
 
-    if ( info->nest-- == 0 )
-    {
-        xfree(info);
-        v->arch.schedule_tail = saved_schedule_tail;
-        v->arch.continue_info = NULL;
-        vcpu_unlock_affinity(v, &mask);
-    }
+    tasklet_kill(&info->tasklet);
+    xfree(info);
+    v->arch.schedule_tail = saved_schedule_tail;
+    v->arch.continue_info = NULL;
 
     (*saved_schedule_tail)(v);
 }
 
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
+static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info)
+{
+    info->ret = info->func((void *)info, info->data);
+
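+    /* Unpause the vcpu unless func scheduled a further continuation. */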
+    if ( info->nest-- == 0 )
+        vcpu_unpause(info->v);
+
+    return;
+}
+
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data)
 {
     struct vcpu *v = current;
-    struct migrate_info *info;
-    cpumask_t mask = cpumask_of_cpu(cpu);
-    int rc;
+    struct migrate_info *info = (struct migrate_info *)hdl;
 
     if ( cpu == smp_processor_id() )
-        return func(data);
+        return func(info, data);
+
+    if ( info != NULL )
+        v = info->v;
 
     info = v->arch.continue_info;
     if ( info == NULL )
@@ -1566,16 +1576,12 @@ int continue_hypercall_on_cpu(int cpu, l
         if ( info == NULL )
             return -ENOMEM;
 
-        rc = vcpu_lock_affinity(v, &mask);
-        if ( rc )
-        {
-            xfree(info);
-            return rc;
-        }
-
         info->saved_schedule_tail = v->arch.schedule_tail;
-        info->saved_affinity = mask;
         info->nest = 0;
+        info->v = v;
+        tasklet_init(&info->tasklet,
+                     (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet,
+                     (unsigned long)info);
 
         v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
         v->arch.continue_info = info;
@@ -1583,17 +1589,17 @@ int continue_hypercall_on_cpu(int cpu, l
     else
     {
         BUG_ON(info->nest != 0);
-        rc = vcpu_locked_change_affinity(v, &mask);
-        if ( rc )
-            return rc;
         info->nest++;
     }
 
     info->func = func;
     info->data = data;
 
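+    /*
+     * Pause the vcpu and let the tasklet run func on the target cpu; the
+     * result is reported back via continue_hypercall_on_cpu_helper when
+     * the vcpu is scheduled again.
+     */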
+    vcpu_pause_nosync(v);
+    tasklet_schedule_cpu(&info->tasklet, cpu);
+    raise_softirq(SCHEDULE_SOFTIRQ);
+
     /* Dummy return value will be overwritten by new schedule_tail. */
-    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
     return 0;
 }
 
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -9,6 +9,7 @@
 #include <xen/lib.h>
 #include <xen/ctype.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/delay.h>
 #include <xen/event.h>
@@ -84,7 +85,7 @@ integer_param("dom0_max_vcpus", opt_dom0
 struct vcpu *__init alloc_dom0_vcpu0(void)
 {
     if ( opt_dom0_max_vcpus == 0 )
-        opt_dom0_max_vcpus = num_online_cpus();
+        opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0);
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
 
@@ -287,7 +288,7 @@ int __init construct_dom0(
     unsigned long _initrd_start, unsigned long initrd_len,
     char *cmdline)
 {
-    int i, rc, compatible, compat32, order, machine;
+    int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
     unsigned long nr_pages;
@@ -786,8 +787,12 @@ int __init construct_dom0(
 
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
+    cpu = first_cpu(cpupool0->cpu_valid);
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+    {
+        cpu = cycle_cpu(cpu, cpupool0->cpu_valid);
+        (void)alloc_vcpu(d, i, cpu);
+    }
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -114,7 +114,7 @@ static int microcode_update_cpu(const vo
     return err;
 }
 
-static long do_microcode_update(void *_info)
+static long do_microcode_update(void *hdl, void *_info)
 {
     struct microcode_info *info = _info;
     int error;
@@ -127,7 +127,8 @@ static long do_microcode_update(void *_i
 
     info->cpu = next_cpu(info->cpu, cpu_online_map);
     if ( info->cpu < NR_CPUS )
-        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+        return continue_hypercall_on_cpu(info->cpu, hdl,
+                                         do_microcode_update, info);
 
     error = info->error;
     xfree(info);
@@ -160,5 +161,6 @@ int microcode_update(XEN_GUEST_HANDLE(co
     info->error = 0;
     info->cpu = first_cpu(cpu_online_map);
 
-    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    return continue_hypercall_on_cpu(info->cpu, NULL,
+                                     do_microcode_update, info);
 }
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -243,7 +243,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+    dom_xen = domain_create(DOMID_XEN, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -251,14 +251,14 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+    dom_io = domain_create(DOMID_IO, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
     
     /*
      * Initialise our DOMID_IO domain.
      * This domain owns sharable pages.
      */
-    dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
+    dom_cow = domain_create(DOMID_COW, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_cow == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -48,12 +48,12 @@ static DEFINE_PER_CPU(uint64_t, freq);
 extern int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf);
 extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
 
-static long cpu_frequency_change_helper(void *data)
+static long cpu_frequency_change_helper(void *hdl, void *data)
 {
     return cpu_frequency_change(this_cpu(freq));
 }
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -314,7 +314,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
         if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) )
             break;
         per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq;
-        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu,
+        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, NULL,
                                         cpu_frequency_change_helper,
                                         NULL);
         break;
@@ -470,7 +470,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
             break;
         }
         ret = continue_hypercall_on_cpu(
-          0, cpu_down_helper, (void *)(unsigned long)cpu);
+          0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
         break;
     }
     break;
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/serial.h>
 #include <xen/softirq.h>
@@ -245,7 +246,7 @@ static void __init init_idle_domain(void
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
 
-    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
+    idle_domain = domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0);
     if ( idle_domain == NULL )
         BUG();
     idle_domain->vcpu = idle_vcpu;
@@ -1122,8 +1123,13 @@ void __init __start_xen(unsigned long mb
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions\n");
 
+    /* Create initial cpupool 0. */
+    cpupool0 = cpupool_create(0, NULL);
+    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
+        panic("Error creating cpupool 0\n");
+
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
+    dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
 
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -39,6 +39,7 @@
 #include <xen/mm.h>
 #include <xen/domain.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
 #include <xen/softirq.h>
@@ -1306,10 +1307,11 @@ int __cpu_disable(void)
 	__sync_lazy_execstate();
 
 	/* It's now safe to remove this processor from the online map */
+	cpu_clear(cpu, cpupool0->cpu_valid);
 	cpu_clear(cpu, cpu_online_map);
 	fixup_irqs();
 
-	cpu_disable_scheduler();
+	cpu_disable_scheduler(cpu, 0);
 
 	return 0;
 }
@@ -1341,16 +1343,12 @@ static int take_cpu_down(void *unused)
 int cpu_down(unsigned int cpu)
 {
 	int err = 0;
+	bool_t pool_rm = 0;
 
 	/* spin_trylock() avoids deadlock with stop_machine_run(). */
 	if (!spin_trylock(&cpu_add_remove_lock))
 		return -EBUSY;
 
-	if (num_online_cpus() == 1) {
-		err = -EBUSY;
-		goto out;
-	}
-
 	/* Can not offline BSP */
 	if (cpu == 0) {
 		err = -EINVAL;
@@ -1364,6 +1362,11 @@ int cpu_down(unsigned int cpu)
 
 	printk("Prepare to bring CPU%d down...\n", cpu);
 
+	err = cpupool_cpu_remove(cpu);
+	if (err)
+		goto out;
+	pool_rm = 1;
+
 	cpufreq_del_cpu(cpu);
 
 	err = stop_machine_run(take_cpu_down, NULL, cpu);
@@ -1379,6 +1382,8 @@ int cpu_down(unsigned int cpu)
 out:
 	if (!err)
 		send_guest_global_virq(dom0, VIRQ_PCPU_STATE);
+	else if (pool_rm)
+		cpupool_cpu_add(cpu);
 	spin_unlock(&cpu_add_remove_lock);
 	return err;
 }
@@ -1568,6 +1573,7 @@ int __devinit __cpu_up(unsigned int cpu)
 		process_pending_softirqs();
 	}
 
+	cpupool_cpu_add(cpu);
 	cpufreq_add_cpu(cpu);
 	return 0;
 }
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -29,7 +29,7 @@
 
 #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -122,7 +122,7 @@ long arch_do_sysctl(
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE:
             ret = continue_hypercall_on_cpu(
-                0, cpu_down_helper, (void *)(unsigned long)cpu);
+                0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_STATUS:
             ret = 0;
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -1,5 +1,6 @@
 obj-y += bitmap.o
 obj-y += cpu.o
+obj-y += cpupool.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-y += event_channel.o
--- /dev/null
+++ b/xen/common/cpupool.c
@@ -0,0 +1,609 @@
+/******************************************************************************
+ * cpupool.c
+ *
+ * Generic cpupool-handling functions.
+ *
+ * Cpupools provide configurable scheduling domains. Each cpupool runs
+ * its own scheduler on a dedicated set of physical cpus. A domain is
+ * bound to exactly one cpupool at any time, but it can be moved to
+ * another cpupool.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+#define for_each_cpupool(ptr)    \
+    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0;                /* Initial cpupool with Dom0 */
+cpumask_t cpupool_free_cpus;             /* cpus not in any cpupool */
+
+static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
+
+static int cpupool0_max_cpus;
+integer_param("pool0_max_cpus", cpupool0_max_cpus);
+
+static int cpupool_moving_cpu = -1;
+static struct cpupool *cpupool_cpu_moving = NULL;
+static cpumask_t cpupool_locked_cpus = CPU_MASK_NONE;
+
+/* cpupool lock: be careful, this lock is sometimes released on a different
+ *               cpu than the one it was acquired on!
+ */
+static DEFINE_SPINLOCK(cpupool_lock);
+
+DEFINE_PER_CPU(struct cpupool *, cpupool);
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+    return xmalloc(struct cpupool);
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+    xfree(c);
+}
+
+/*
+ * find a cpupool by its id. to be called with the cpupool lock held.
+ * if exact is not specified, the first cpupool with an id larger than or
+ * equal to the searched id is returned.
+ * returns NULL if not found.
+ */
+static struct cpupool *cpupool_find_by_id(int id, int exact)
+{
+    struct cpupool **q;
+
+    for_each_cpupool(q)
+    {
+        if ( (*q)->cpupool_id == id )
+            return *q;
+        if ( (*q)->cpupool_id > id )
+            break;
+    }
+    return exact ? NULL : *q;
+}
+
+/*
+ * create a new cpupool with specified poolid and scheduler
+ * returns pointer to new cpupool structure on success, NULL otherwise
+ * possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+struct cpupool *cpupool_create(int poolid, char *sched)
+{
+    struct cpupool *c;
+    struct cpupool **q;
+    int last = 0;
+
+    if ( (c = alloc_cpupool_struct()) == NULL )
+        return NULL;
+    memset(c, 0, sizeof(*c));
+
+    printk(XENLOG_DEBUG "cpupool_create(pool=%d,sched=%s)\n", poolid, sched);
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+    {
+        last = (*q)->cpupool_id;
+        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+            break;
+    }
+    if ( *q != NULL )
+    {
+        if ( (*q)->cpupool_id == poolid )
+        {
+            spin_unlock(&cpupool_lock);
+            free_cpupool_struct(c);
+            return NULL;
+        }
+        c->next = *q;
+    }
+    *q = c;
+    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+    if ( schedule_init_global(sched, &(c->sched)) )
+    {
+        spin_unlock(&cpupool_lock);
+        cpupool_destroy(c);
+        return NULL;
+    }
+    spin_unlock(&cpupool_lock);
+
+    printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id,
+        c->sched.name, c->sched.opt_name);
+
+    return c;
+}
+/*
+ * destroys the given cpupool
+ * returns 0 on success, 1 otherwise
+ * possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+int cpupool_destroy(struct cpupool *c)
+{
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+        if ( *q == c )
+            break;
+    if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+    *q = c->next;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_destroy(pool=%d)\n", c->cpupool_id);
+    schedule_deinit_global(&(c->sched));
+    free_cpupool_struct(c);
+    return 0;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ * cpupool_lock must be held
+ */
+static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
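+    /* A cpu being removed from a pool may only be put back into that pool. */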
+    if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
+        return -EBUSY;
+    per_cpu(cpupool, cpu) = c;
+    schedule_cpu_switch(cpu, c);
+    cpu_clear(cpu, cpupool_free_cpus);
+    if ( cpupool_moving_cpu == cpu )
+    {
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    cpu_set(cpu, c->cpu_valid);
+    return 0;
+}
+
+/*
+ * assign free physical cpus to a cpupool
+ * cpus assigned are unused cpus with lowest possible ids
+ * returns the number of cpus assigned
+ */
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu)
+{
+    int i;
+    int n;
+
+    n = 0;
+    spin_lock(&cpupool_lock);
+    for_each_cpu_mask(i, cpupool_free_cpus)
+    {
+        if ( cpupool_assign_cpu_locked(c, i) == 0 )
+            n++;
+        if ( n == ncpu )
+            break;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_assign_ncpu(pool=%d,ncpu=%d) rc %d\n",
+        c->cpupool_id, ncpu, n);
+    return n;
+}
+
+static long cpupool_unassign_cpu_helper(void *hdl, void *info)
+{
+    struct cpupool *c = (struct cpupool *)info;
+    int cpu = cpupool_moving_cpu;
+    long ret;
+    int cpupool_id = c->cpupool_id;
+
+    ret = cpu_disable_scheduler(cpu, 1);
+    cpu_set(cpu, cpupool_free_cpus);
+    if ( !ret )
+    {
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %ld\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * unassign a specific cpu from a cpupool
+ * we must be sure not to run on the cpu to be unassigned! to achieve this,
+ * the main work is performed via continue_hypercall_on_cpu on a
+ * different cpu.
+ * if the cpu to be removed is the last one of the cpupool, no active domain
+ * may be bound to the cpupool; dying domains are moved to cpupool0, as they
+ * might still be zombies.
+ * possible failures:
+ * - last cpu and still active domains in cpupool
+ * - cpu just being unplugged
+ */
+static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+    int work_cpu;
+    int ret;
+    struct domain *d;
+    int cpupool_id = c->cpupool_id;
+
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
+        cpupool_id, cpu);
+    spin_lock(&cpupool_lock);
+    ret = -EBUSY;
+    if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) )
+        goto out;
+    if ( cpu_isset(cpu, cpupool_locked_cpus) )
+        goto out;
+
+    ret = 0;
+    if ( !cpu_isset(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) )
+        goto out;
+
+    if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) &&
+         (cpu != cpupool_moving_cpu) )
+    {
+        for_each_domain(d)
+        {
+            if ( d->cpupool != c )
+                continue;
+            if ( !d->is_dying )
+            {
+                ret = -EBUSY;
+                break;
+            }
+            c->n_dom--;
+            ret = sched_move_domain(d, cpupool0);
+            if ( ret )
+            {
+                c->n_dom++;
+                break;
+            }
+            cpupool0->n_dom++;
+        }
+        if ( ret )
+            goto out;
+    }
+    cpupool_moving_cpu = cpu;
+    cpupool_cpu_moving = c;
+    cpu_clear(cpu, c->cpu_valid);
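+    /* Do the actual removal on a cpu other than the one being removed. */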
+    work_cpu = smp_processor_id();
+    if ( work_cpu == cpu )
+    {
+        work_cpu = first_cpu(cpupool0->cpu_valid);
+        if ( work_cpu == cpu )
+            work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
+    }
+    return continue_hypercall_on_cpu(work_cpu, NULL,
+                                     cpupool_unassign_cpu_helper, c);
+
+out:
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * assign cpus to the default cpupool
+ * by default all cpus are used; fewer cpus may be specified via a boot parameter
+ * possible failures:
+ * - no cpu assigned
+ */
+int __init cpupool0_cpu_assign(struct cpupool *c)
+{
+    if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) )
+        cpupool0_max_cpus = num_online_cpus();
+    if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) )
+        return 1;
+    return 0;
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+    struct cpupool *c;
+    int rc = 1;
+    int n_dom;
+
+    if ( poolid == CPUPOOLID_NONE )
+        return 0;
+    spin_lock(&cpupool_lock);
+    c = cpupool_find_by_id(poolid, 1);
+    if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+    {
+        c->n_dom++;
+        n_dom = c->n_dom;
+        d->cpupool = c;
+        rc = 0;
+    }
+    spin_unlock(&cpupool_lock);
+    if ( !rc )
+        printk(XENLOG_DEBUG "cpupool_add_domain(dom=%d,pool=%d) n_dom %d\n",
+            d->domain_id, poolid, n_dom);
+    return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+    int cpupool_id;
+    int n_dom;
+
+    if ( d->cpupool == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpupool_id = d->cpupool->cpupool_id;
+    d->cpupool->n_dom--;
+    n_dom = d->cpupool->n_dom;
+    d->cpupool = NULL;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
+        d->domain_id, cpupool_id, n_dom);
+    return;
+}
+
+/*
+ * called to add a new cpu to the pool administration
+ * a hotplugged cpu is added to cpupool0 so it can be used by dom0
+ */
+void cpupool_cpu_add(unsigned int cpu)
+{
+    if ( cpupool0 == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpu_clear(cpu, cpupool_locked_cpus);
+    cpu_set(cpu, cpupool_free_cpus);
+    (void)cpupool_assign_cpu_locked(cpupool0, cpu);
+    spin_unlock(&cpupool_lock);
+    return;
+}
+
+/*
+ * called to remove a cpu from the pool administration
+ * the cpu to be removed is locked to avoid removing it from dom0
+ * returns failure if the cpu is not in cpupool0
+ */
+int cpupool_cpu_remove(unsigned int cpu)
+{
+    int ret = 0;
+
+    spin_lock(&cpupool_lock);
+    if ( !cpu_isset(cpu, cpupool0->cpu_valid) )
+        ret = -EBUSY;
+    else
+        cpu_set(cpu, cpupool_locked_cpus);
+    spin_unlock(&cpupool_lock);
+
+    return ret;
+}
+
+/*
+ * do cpupool-related sysctl operations
+ */
+int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
+{
+    int ret;
+    struct cpupool *c;
+
+    switch ( op->op )
+    {
+
+    case XEN_SYSCTL_CPUPOOL_OP_CREATE:
+    {
+        int poolid;
+        struct scheduler *sched;
+
+        poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
+            CPUPOOLID_NONE: op->cpupool_id;
+        sched = scheduler_get_by_id(op->sched_id);
+        ret = -ENOENT;
+        if ( sched == NULL )
+            break;
+        ret = 0;
+        c = cpupool_create(poolid, sched->opt_name);
+        if ( c == NULL )
+            ret = -EINVAL;
+        else
+            op->cpupool_id = c->cpupool_id;
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0;
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_INFO:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        op->cpupool_id = c->cpupool_id;
+        op->sched_id = c->sched.sched_id;
+        op->n_dom = c->n_dom;
+        cpumask_to_xenctl_cpumap(&(op->cpumap), &(c->cpu_valid));
+        ret = 0;
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
+    {
+        unsigned cpu;
+
+        cpu = op->cpu;
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d)\n",
+            op->cpupool_id, cpu);
+        spin_lock(&cpupool_lock);
+        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
+            cpu = first_cpu(cpupool_free_cpus);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            goto addcpu_out;
+        ret = -EBUSY;
+        if ( !cpu_isset(cpu, cpupool_free_cpus) )
+            goto addcpu_out;
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        ret = -ENOENT;
+        if ( c == NULL )
+            goto addcpu_out;
+        ret = cpupool_assign_cpu_locked(c, cpu);
+addcpu_out:
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
+            op->cpupool_id, cpu, ret);
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
+    {
+        unsigned cpu;
+
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        cpu = op->cpu;
+        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
+            cpu = last_cpu(c->cpu_valid);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            break;
+        /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and
+         * will continue after the local return
+         */
+        ret = cpupool_unassign_cpu(c, cpu);
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
+    {
+        struct domain *d;
+
+        ret = -EINVAL;
+        if ( op->domid == 0 )
+            break;
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domid);
+        if ( d == NULL )
+            break;
+        if ( d->cpupool == NULL )
+        {
+            ret = -EINVAL;
+            rcu_unlock_domain(d);
+            break;
+        }
+        if ( op->cpupool_id == d->cpupool->cpupool_id )
+        {
+            ret = 0;
+            rcu_unlock_domain(d);
+            break;
+        }
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d\n",
+            d->domain_id, op->cpupool_id);
+        ret = -ENOENT;
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+        {
+            d->cpupool->n_dom--;
+            ret = sched_move_domain(d, c);
+            if ( ret )
+                d->cpupool->n_dom++;
+            else
+                c->n_dom++;
+        }
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d ret %d\n",
+            d->domain_id, op->cpupool_id, ret);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
+    {
+        cpumask_to_xenctl_cpumap(&(op->cpumap),
+            &cpupool_free_cpus);
+        ret = 0;
+    }
+    break;
+
+    default:
+        ret = -ENOSYS;
+
+    }
+
+    return ret;
+}
+
+void schedule_dump(struct cpupool *c);
+
+void dump_runq(unsigned char key)
+{
+    unsigned long    flags;
+    s_time_t         now = NOW();
+    struct cpupool **c;
+
+    spin_lock(&cpupool_lock);
+    local_irq_save(flags);
+
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    printk("Idle cpupool:\n");
+    schedule_dump(NULL);
+
+    for_each_cpupool(c)
+    {
+        printk("Cpupool %d:\n", (*c)->cpupool_id);
+        schedule_dump(*c);
+    }
+
+    local_irq_restore(flags);
+    spin_unlock(&cpupool_lock);
+}
+
+static int __init cpupool_init(void)
+{
+    cpupool_free_cpus = cpu_online_map;
+    cpupool_list = NULL;
+    return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -209,7 +209,7 @@ static void __init parse_extra_guest_irq
 custom_param("extra_guest_irqs", parse_extra_guest_irqs);
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref)
 {
     struct domain *d, **pd;
     enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
@@ -292,6 +292,9 @@ struct domain *domain_create(
         goto fail;
     init_status |= INIT_arch;
 
+    if ( cpupool_add_domain(d, poolid) != 0 )
+        goto fail;
+
     if ( sched_init_domain(d) != 0 )
         goto fail;
 
@@ -603,6 +606,8 @@ static void complete_domain_destroy(stru
 
     rangeset_domain_destroy(d);
 
+    cpupool_rm_domain(d);
+
     sched_destroy_domain(d);
 
     /* Free page used by xen oprofile buffer. */
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -11,6 +11,7 @@
 #include <xen/lib.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/domain_page.h>
@@ -141,10 +142,12 @@ void getdomaininfo(struct domain *d, str
     info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT);
     BUG_ON(SHARED_M2P(info->shared_info_frame));
 
+    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 }
 
-static unsigned int default_vcpu0_location(void)
+static unsigned int default_vcpu0_location(cpumask_t *online)
 {
     struct domain *d;
     struct vcpu   *v;
@@ -174,7 +177,7 @@ static unsigned int default_vcpu0_locati
     if ( cpus_weight(per_cpu(cpu_sibling_map, 0)) > 1 )
         cpu = next_cpu(cpu, per_cpu(cpu_sibling_map, 0));
     cpu_exclude_map = per_cpu(cpu_sibling_map, 0);
-    for_each_online_cpu ( i )
+    for_each_cpu_mask(i, *online)
     {
         if ( cpu_isset(i, cpu_exclude_map) )
             continue;
@@ -389,6 +392,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         domid_t        dom;
         static domid_t rover = 0;
         unsigned int domcr_flags;
+        int            pool = 0;
 
         ret = -EINVAL;
         if ( supervisor_mode_kernel ||
@@ -432,7 +436,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             domcr_flags |= DOMCRF_oos_off;
 
         ret = -ENOMEM;
-        d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
+        d = domain_create(dom, pool, domcr_flags, op->u.createdomain.ssidref);
         if ( d == NULL )
             break;
 
@@ -451,6 +455,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     {
         struct domain *d;
         unsigned int i, max = op->u.max_vcpus.max, cpu;
+        cpumask_t *online;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
@@ -499,6 +504,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             goto maxvcpu_out;
 
         ret = -ENOMEM;
+        online = (d->cpupool == NULL) ? &cpu_online_map : &d->cpupool->cpu_valid;
         if ( max > d->max_vcpus )
         {
             struct vcpu **vcpus;
@@ -522,8 +528,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
                 continue;
 
             cpu = (i == 0) ?
-                default_vcpu0_location() :
-                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
+                default_vcpu0_location(online) :
+                cycle_cpu(d->vcpu[i-1]->processor, *online);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
--- a/xen/common/kexec.c
+++ b/xen/common/kexec.c
@@ -235,7 +235,7 @@ void kexec_crash(void)
     BUG();
 }
 
-static long kexec_reboot(void *_image)
+static long kexec_reboot(void *hdl, void *_image)
 {
     xen_kexec_image_t *image = _image;
 
@@ -584,7 +584,7 @@ static int kexec_exec(XEN_GUEST_HANDLE(v
     {
     case KEXEC_TYPE_DEFAULT:
         image = &kexec_image[base + pos];
-        ret = continue_hypercall_on_cpu(0, kexec_reboot, image);
+        ret = continue_hypercall_on_cpu(0, NULL, kexec_reboot, image);
         break;
     case KEXEC_TYPE_CRASH:
         kexec_crash(); /* Does not return */
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -70,11 +70,15 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_PCPU(_c)     \
     ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
 #define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
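+/* cpus of a pool: its cpu_valid mask, or the free cpus if pool is NULL */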
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 
 
 /*
@@ -160,10 +164,12 @@ struct csched_private {
     struct timer  master_ticker;
     unsigned int master;
     cpumask_t idlers;
+    cpumask_t cpus;
     uint32_t weight;
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
+    int ticker_active;
 };
 
 
@@ -171,8 +177,10 @@ struct csched_private {
  * Global variables
  */
 static struct csched_private csched_priv;
+static struct csched_private *csched_priv0 = NULL;
 
 static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
 
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
@@ -233,6 +241,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
     cpumask_t mask;
 
     ASSERT(cur);
@@ -259,14 +268,14 @@ __runq_tickle(unsigned int cpu, struct c
      */
     if ( cur->pri > CSCHED_PRI_IDLE )
     {
-        if ( cpus_empty(csched_priv.idlers) )
+        if ( cpus_empty(prv->idlers) )
         {
             CSCHED_STAT_CRANK(tickle_idlers_none);
         }
         else
         {
             CSCHED_STAT_CRANK(tickle_idlers_some);
-            cpus_or(mask, mask, csched_priv.idlers);
+            cpus_or(mask, mask, prv->idlers);
             cpus_and(mask, mask, new->vcpu->cpu_affinity);
         }
     }
@@ -276,40 +285,80 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
-static int
-csched_pcpu_init(int cpu)
+static void
+csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = pcpu;
+    unsigned long flags;
+
+    if ( spc == NULL )
+        return;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    prv->credit -= CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus--;
+    cpu_clear(cpu, prv->idlers);
+    cpu_clear(cpu, prv->cpus);
+    if ( (prv->master == cpu) && (prv->ncpus > 0) )
+    {
+        prv->master = first_cpu(prv->cpus);
+        migrate_timer(&prv->master_ticker, prv->master);
+    }
+    kill_timer(&spc->ticker);
+    if ( prv->ncpus == 0 )
+        kill_timer(&prv->master_ticker);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(spc);
+}
+
+static void *
+csched_alloc_pdata(struct scheduler *ops, int cpu)
 {
     struct csched_pcpu *spc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     /* Allocate per-PCPU info */
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
-        return -1;
+        return NULL;
     memset(spc, 0, sizeof(*spc));
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     /* Initialize/update system-wide config */
-    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
-    if ( csched_priv.ncpus <= cpu )
-        csched_priv.ncpus = cpu + 1;
-    if ( csched_priv.master >= csched_priv.ncpus )
-        csched_priv.master = cpu;
+    prv->credit += CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus++;
+    cpu_set(cpu, prv->cpus);
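+    /* For pools other than the boot pool the first cpu becomes the master;
+     * ticker_active == 2: master ticker initialised but not yet running. */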
+    if ( (prv->ncpus == 1) && (prv != csched_priv0) )
+    {
+        prv->master = cpu;
+        init_timer( &prv->master_ticker, csched_acct, prv, cpu);
+        prv->ticker_active = 2;
+    }
 
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+
+    if ( prv == csched_priv0 )
+        prv->master = first_cpu(prv->cpus);
+
     INIT_LIST_HEAD(&spc->runq);
-    spc->runq_sort_last = csched_priv.runq_sort;
+    spc->runq_sort_last = prv->runq_sort;
     spc->idle_bias = NR_CPUS - 1;
-    per_cpu(schedule_data, cpu).sched_priv = spc;
+    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
+        per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
-    cpu_set(cpu, csched_priv.idlers);
+    cpu_set(cpu, prv->idlers);
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 
-    return 0;
+    return spc;
 }
 
 #ifndef NDEBUG
@@ -382,17 +431,19 @@ __csched_vcpu_is_migrateable(struct vcpu
 }
 
 static int
-_csched_cpu_pick(struct vcpu *vc, bool_t commit)
+_csched_cpu_pick(struct scheduler *ops, struct vcpu *vc, bool_t commit)
 {
     cpumask_t cpus;
     cpumask_t idlers;
+    cpumask_t *online;
     int cpu;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    online = CSCHED_CPUONLINE(vc->domain->cpupool);
+    cpus_and(cpus, *online, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : cycle_cpu(vc->processor, cpus);
@@ -410,7 +461,7 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
      * like run two VCPUs on co-hyperthreads while there are idle cores
      * or sockets.
      */
-    cpus_and(idlers, cpu_online_map, csched_priv.idlers);
+    cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers);
     cpu_set(cpu, idlers);
     cpus_and(cpus, cpus, idlers);
     cpu_clear(cpu, cpus);
@@ -456,18 +507,18 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
-    return _csched_cpu_pick(vc, 1);
+    return _csched_cpu_pick(ops, vc, 1);
 }
 
 static inline void
-__csched_vcpu_acct_start(struct csched_vcpu *svc)
+__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     if ( list_empty(&svc->active_vcpu_elem) )
     {
@@ -478,16 +529,17 @@ __csched_vcpu_acct_start(struct csched_v
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
-            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
-            csched_priv.weight += sdom->weight;
+            list_add(&sdom->active_sdom_elem, &prv->active_sdom);
+            prv->weight += sdom->weight;
         }
     }
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 }
 
 static inline void
-__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+__csched_vcpu_acct_stop_locked(struct csched_private *prv,
+    struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
 
@@ -500,16 +552,17 @@ __csched_vcpu_acct_stop_locked(struct cs
     list_del_init(&svc->active_vcpu_elem);
     if ( list_empty(&sdom->active_vcpu) )
     {
-        BUG_ON( csched_priv.weight < sdom->weight );
+        BUG_ON( prv->weight < sdom->weight );
         list_del_init(&sdom->active_sdom_elem);
-        csched_priv.weight -= sdom->weight;
+        prv->weight -= sdom->weight;
     }
 }
 
 static void
-csched_vcpu_acct(unsigned int cpu)
+csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct scheduler *ops = per_cpu(scheduler, cpu);
 
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
@@ -538,9 +591,9 @@ csched_vcpu_acct(unsigned int cpu)
      */
     if ( list_empty(&svc->active_vcpu_elem) )
     {
-        __csched_vcpu_acct_start(svc);
+        __csched_vcpu_acct_start(prv, svc);
     }
-    else if ( _csched_cpu_pick(current, 0) != cpu )
+    else if ( _csched_cpu_pick(ops, current, 0) != cpu )
     {
         CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
         CSCHED_STAT_CRANK(migrate_running);
@@ -549,66 +602,75 @@ csched_vcpu_acct(unsigned int cpu)
     }
 }
 
-static int
-csched_vcpu_init(struct vcpu *vc)
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd)
 {
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
     struct csched_vcpu *svc;
 
-    CSCHED_STAT_CRANK(vcpu_init);
-
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
     memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
-    svc->sdom = sdom;
+    svc->sdom = dd;
     svc->vcpu = vc;
     atomic_set(&svc->credit, 0);
     svc->flags = 0U;
-    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+    svc->pri = is_idle_domain(vc->domain) ?
+        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
     CSCHED_VCPU_STATS_RESET(svc);
-    vc->sched_priv = svc;
+    CSCHED_STAT_CRANK(vcpu_init);
+    return svc;
+}
 
-    /* Allocate per-PCPU info */
-    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
-    {
-        if ( csched_pcpu_init(vc->processor) != 0 )
-            return -1;
-    }
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
 
-    CSCHED_VCPU_CHECK(vc);
-    return 0;
+    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
+        __runq_insert(vc->processor, svc);
 }
 
 static void
-csched_vcpu_destroy(struct vcpu *vc)
+csched_free_vdata(struct scheduler *ops, void *priv)
 {
-    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
-    struct csched_dom * const sdom = svc->sdom;
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_vcpu *svc = priv;
     unsigned long flags;
 
-    CSCHED_STAT_CRANK(vcpu_destroy);
-
-    BUG_ON( sdom == NULL );
-    BUG_ON( !list_empty(&svc->runq_elem) );
+    if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&(prv->lock), flags);
 
     if ( !list_empty(&svc->active_vcpu_elem) )
-        __csched_vcpu_acct_stop_locked(svc);
+        __csched_vcpu_acct_stop_locked(prv, svc);
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&(prv->lock), flags);
 
     xfree(svc);
 }
 
 static void
-csched_vcpu_sleep(struct vcpu *vc)
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    CSCHED_STAT_CRANK(vcpu_destroy);
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    csched_free_vdata(ops, svc);
+}
+
+static void
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -623,7 +685,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -679,10 +741,12 @@ csched_vcpu_wake(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
+    struct scheduler *ops,
     struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -694,14 +758,14 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&csched_priv.lock, flags);
+        spin_lock_irqsave(&prv->lock, flags);
 
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
             {
-                csched_priv.weight -= sdom->weight;
-                csched_priv.weight += op->u.credit.weight;
+                prv->weight -= sdom->weight;
+                prv->weight += op->u.credit.weight;
             }
             sdom->weight = op->u.credit.weight;
         }
@@ -709,25 +773,20 @@ csched_dom_cntl(
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        spin_unlock_irqrestore(&prv->lock, flags);
     }
 
     return 0;
 }
 
-static int
-csched_dom_init(struct domain *dom)
+static void *
+csched_alloc_domdata(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
 
-    CSCHED_STAT_CRANK(dom_init);
-
-    if ( is_idle_domain(dom) )
-        return 0;
-
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
-        return -ENOMEM;
+        return NULL;
     memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
@@ -737,16 +796,40 @@ csched_dom_init(struct domain *dom)
     sdom->dom = dom;
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->cap = 0U;
+
+    return (void *)sdom;
+}
+
+static int
+csched_dom_init(struct scheduler *ops, struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    CSCHED_STAT_CRANK(dom_init);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = csched_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
     dom->sched_priv = sdom;
 
     return 0;
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_free_domdata(struct scheduler *ops, void *data)
+{
+    xfree(data);
+}
+
+static void
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     CSCHED_STAT_CRANK(dom_destroy);
-    xfree(CSCHED_DOM(dom));
+    csched_free_domdata(ops, CSCHED_DOM(dom));
 }
 
 /*
@@ -757,7 +840,7 @@ csched_dom_destroy(struct domain *dom)
  * remember the last UNDER to make the move up operation O(1).
  */
 static void
-csched_runq_sort(unsigned int cpu)
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *runq, *elem, *next, *last_under;
@@ -765,7 +848,7 @@ csched_runq_sort(unsigned int cpu)
     unsigned long flags;
     int sort_epoch;
 
-    sort_epoch = csched_priv.runq_sort;
+    sort_epoch = prv->runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
         return;
 
@@ -802,6 +885,7 @@ csched_runq_sort(unsigned int cpu)
 static void
 csched_acct(void* dummy)
 {
+    struct csched_private *prv = dummy;
     unsigned long flags;
     struct list_head *iter_vcpu, *next_vcpu;
     struct list_head *iter_sdom, *next_sdom;
@@ -818,22 +902,22 @@ csched_acct(void* dummy)
     int credit;
 
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
-    weight_total = csched_priv.weight;
-    credit_total = csched_priv.credit;
+    weight_total = prv->weight;
+    credit_total = prv->credit;
 
     /* Converge balance towards 0 when it drops negative */
-    if ( csched_priv.credit_balance < 0 )
+    if ( prv->credit_balance < 0 )
     {
-        credit_total -= csched_priv.credit_balance;
+        credit_total -= prv->credit_balance;
         CSCHED_STAT_CRANK(acct_balance);
     }
 
     if ( unlikely(weight_total == 0) )
     {
-        csched_priv.credit_balance = 0;
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        prv->credit_balance = 0;
+        spin_unlock_irqrestore(&prv->lock, flags);
         CSCHED_STAT_CRANK(acct_no_work);
         goto out;
     }
@@ -845,7 +929,7 @@ csched_acct(void* dummy)
     credit_xtra = 0;
     credit_cap = 0U;
 
-    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
     {
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
 
@@ -865,9 +949,9 @@ csched_acct(void* dummy)
          * only when the system-wide credit balance is negative.
          */
         credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
-        if ( csched_priv.credit_balance < 0 )
+        if ( prv->credit_balance < 0 )
         {
-            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+            credit_peak += ( ( -prv->credit_balance * sdom->weight) +
                              (weight_total - 1)
                            ) / weight_total;
         }
@@ -909,7 +993,7 @@ csched_acct(void* dummy)
                  */
                 CSCHED_STAT_CRANK(acct_reorder);
                 list_del(&sdom->active_sdom_elem);
-                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                list_add(&sdom->active_sdom_elem, &prv->active_sdom);
             }
 
             credit_fair = credit_peak;
@@ -975,7 +1059,7 @@ csched_acct(void* dummy)
                 /* Upper bound on credits means VCPU stops earning */
                 if ( credit > CSCHED_CREDITS_PER_TSLICE )
                 {
-                    __csched_vcpu_acct_stop_locked(svc);
+                    __csched_vcpu_acct_stop_locked(prv, svc);
                     credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
@@ -987,15 +1071,15 @@ csched_acct(void* dummy)
         }
     }
 
-    csched_priv.credit_balance = credit_balance;
+    prv->credit_balance = credit_balance;
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 
     /* Inform each CPU that its runq needs to be sorted */
-    csched_priv.runq_sort++;
+    prv->runq_sort++;
 
 out:
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &prv->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 }
 
@@ -1004,6 +1088,7 @@ csched_tick(void *_cpu)
 {
     unsigned int cpu = (unsigned long)_cpu;
     struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
 
     spc->tick++;
 
@@ -1011,7 +1096,7 @@ csched_tick(void *_cpu)
      * Accounting for running VCPU
      */
     if ( !is_idle_vcpu(current) )
-        csched_vcpu_acct(cpu);
+        csched_vcpu_acct(prv, cpu);
 
     /*
      * Check if runq needs to be sorted
@@ -1020,7 +1105,7 @@ csched_tick(void *_cpu)
      * modified priorities. This is a special O(n) sort and runs at most
      * once per accounting period (currently 30 milliseconds).
      */
-    csched_runq_sort(cpu);
+    csched_runq_sort(prv, cpu);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
@@ -1073,16 +1158,19 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(struct csched_private *prv, int cpu,
+    struct csched_vcpu *snext)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
+    cpumask_t *online;
     int peer_cpu;
 
     BUG_ON( cpu != snext->vcpu->processor );
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
 
     /* If this CPU is going offline we shouldn't steal work. */
-    if ( unlikely(!cpu_online(cpu)) )
+    if ( unlikely(!cpu_isset(cpu, *online)) )
         goto out;
 
     if ( snext->pri == CSCHED_PRI_IDLE )
@@ -1096,7 +1184,7 @@ csched_load_balance(int cpu, struct csch
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpus_andnot(workers, *online, prv->idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
@@ -1138,11 +1226,12 @@ csched_load_balance(int cpu, struct csch
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret;
 
@@ -1177,7 +1266,7 @@ csched_schedule(s_time_t now)
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(cpu, snext);
+        snext = csched_load_balance(prv, cpu, snext);
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
@@ -1185,12 +1274,12 @@ csched_schedule(s_time_t now)
      */
     if ( snext->pri == CSCHED_PRI_IDLE )
     {
-        if ( !cpu_isset(cpu, csched_priv.idlers) )
-            cpu_set(cpu, csched_priv.idlers);
+        if ( !cpu_isset(cpu, prv->idlers) )
+            cpu_set(cpu, prv->idlers);
     }
-    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    else if ( cpu_isset(cpu, prv->idlers) )
     {
-        cpu_clear(cpu, csched_priv.idlers);
+        cpu_clear(cpu, prv->idlers);
     }
 
     if ( !is_idle_vcpu(snext->vcpu) )
@@ -1237,7 +1326,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_pcpu *spc;
@@ -1275,9 +1364,10 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
 #define idlers_buf keyhandler_scratch
 
@@ -1294,12 +1384,12 @@ csched_dump(void)
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
-           csched_priv.ncpus,
-           csched_priv.master,
-           csched_priv.credit,
-           csched_priv.credit_balance,
-           csched_priv.weight,
-           csched_priv.runq_sort,
+           prv->ncpus,
+           prv->master,
+           prv->credit,
+           prv->credit_balance,
+           prv->weight,
+           prv->runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_MSEC,
@@ -1307,12 +1397,12 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
     printk("idlers: %s\n", idlers_buf);
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    list_for_each( iter_sdom, &prv->active_sdom )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
@@ -1329,18 +1419,30 @@ csched_dump(void)
 #undef idlers_buf
 }
 
-static void
-csched_init(void)
+static int
+csched_init(struct scheduler *ops)
 {
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.active_sdom);
-    csched_priv.ncpus = 0;
-    csched_priv.master = UINT_MAX;
-    cpus_clear(csched_priv.idlers);
-    csched_priv.weight = 0U;
-    csched_priv.credit = 0U;
-    csched_priv.credit_balance = 0;
-    csched_priv.runq_sort = 0U;
+    struct csched_private *prv;
+
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    memset(prv, 0, sizeof(*prv));
+    if ( csched_priv0 == NULL )
+        csched_priv0 = prv;
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->active_sdom);
+    prv->ncpus = 0;
+    prv->master = UINT_MAX;
+    cpus_clear(prv->idlers);
+    prv->weight = 0U;
+    prv->credit = 0U;
+    prv->credit_balance = 0;
+    prv->runq_sort = 0U;
+    prv->ticker_active = (csched_priv0 == prv) ? 0 : 1;
+
+    return 0;
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
@@ -1350,54 +1452,81 @@ static __init int csched_start_tickers(v
     unsigned int cpu;
 
     /* Is the credit scheduler initialised? */
-    if ( csched_priv.ncpus == 0 )
+    if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) )
         return 0;
 
+    csched_priv0->ticker_active = 1;
+
     for_each_online_cpu ( cpu )
     {
         spc = CSCHED_PCPU(cpu);
         set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
     }
 
-    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
-                    csched_priv.master);
+    init_timer( &csched_priv0->master_ticker, csched_acct, csched_priv0,
+                    csched_priv0->master);
 
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &csched_priv0->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 
     return 0;
 }
 __initcall(csched_start_tickers);
 
-static void csched_tick_suspend(void)
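+/* Free the per-instance private data allocated by csched_init(). */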
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     stop_timer(&spc->ticker);
 }
 
-static void csched_tick_resume(void)
+static void csched_tick_resume(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
     uint64_t now = NOW();
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( !prv->ticker_active )
+        return;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
             - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
+
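+    /* ticker_active == 2 requests re-arming the master (accounting) ticker
+     * on the master cpu. */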
+    if ( (prv->ticker_active == 2) && (prv->master == cpu) )
+    {
+        set_timer( &prv->master_ticker, now +
+            MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT -
+            now % (MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT));
+        prv->ticker_active = 1;
+    }
 }
 
-const struct scheduler sched_credit_def = {
+struct scheduler sched_credit_def = {
     .name           = "SMP Credit Scheduler",
     .opt_name       = "credit",
     .sched_id       = XEN_SCHEDULER_CREDIT,
+    .sched_data     = &csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
-    .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1411,6 +1540,13 @@ const struct scheduler sched_credit_def 
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_pdata    = csched_alloc_pdata,
+    .free_pdata     = csched_free_pdata,
+    .alloc_domdata  = csched_alloc_domdata,
+    .free_domdata   = csched_free_domdata,
 
     .tick_suspend   = csched_tick_suspend,
     .tick_resume    = csched_tick_resume,
--- a/xen/common/sched_sedf.c
+++ b/xen/common/sched_sedf.c
@@ -21,6 +21,9 @@
             printk(_a );                        \
     } while ( 0 )
 
+/* cpus usable by this scheduler instance: the pool's valid cpus, or the
+ * free cpus when no pool is given. */
+#define SEDF_CPUONLINE(_pool)                                             \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
+
 #ifndef NDEBUG
 #define SEDF_STATS
 #define CHECK(_p)                                           \
@@ -132,7 +135,7 @@ struct sedf_cpu_info {
 #define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
 
 
-static void sedf_dump_cpu_state(int i);
+static void sedf_dump_cpu_state(struct scheduler *ops, int i);
 
 static inline int extraq_on(struct vcpu *d, int i)
 {
@@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor
 }
 
 
-static int sedf_init_vcpu(struct vcpu *v)
+static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v, void *dd)
 {
     struct sedf_vcpu_info *inf;
 
-    if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL )
-        return -1;
-    memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info));
+    inf = xmalloc(struct sedf_vcpu_info);
+    if ( inf == NULL )
+        return NULL;
 
-    inf = EDOM_INFO(v);
+    memset(inf, 0, sizeof(struct sedf_vcpu_info));
     inf->vcpu = v;
- 
-    /* Allocate per-CPU context if this is the first domain to be added. */
-    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
-    {
-        per_cpu(schedule_data, v->processor).sched_priv = 
-            xmalloc(struct sedf_cpu_info);
-        BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL);
-        memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor)));
-        INIT_LIST_HEAD(WAITQ(v->processor));
-        INIT_LIST_HEAD(RUNQ(v->processor));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q));
-    }
-       
+
     /* Every VCPU gets an equal share of extratime by default. */
     inf->deadl_abs   = 0;
     inf->latency     = 0;
@@ -383,39 +373,88 @@ static int sedf_init_vcpu(struct vcpu *v
     }
     else
     {
-        EDOM_INFO(v)->deadl_abs = 0;
-        EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
+        inf->deadl_abs = 0;
+        inf->status &= ~SEDF_ASLEEP;
     }
 
-    return 0;
+    return inf;
+}
+
+static void *
+sedf_alloc_pdata(struct scheduler *ops, int cpu)
+{
+    struct sedf_cpu_info *spc;
+
+    spc = xmalloc(struct sedf_cpu_info);
+    BUG_ON(spc == NULL);
+    memset(spc, 0, sizeof(*spc));
+    INIT_LIST_HEAD(&spc->waitq);
+    INIT_LIST_HEAD(&spc->runnableq);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
+
+    return (void *)spc;
+}
+
+static void
+sedf_free_pdata(struct scheduler *ops, void *spc, int cpu)
+{
+    if ( spc == NULL )
+        return;
+
+    xfree(spc);
+}
+
+static void sedf_free_vdata(struct scheduler *ops, void *priv)
+{
+    xfree(priv);
 }
 
-static void sedf_destroy_vcpu(struct vcpu *v)
+static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v)
 {
-    xfree(v->sched_priv);
+    sedf_free_vdata(ops, v->sched_priv);
 }
 
-static int sedf_init_domain(struct domain *d)
+static void *
+sedf_alloc_domdata(struct scheduler *ops, struct domain *d)
 {
-    d->sched_priv = xmalloc(struct sedf_dom_info);
+    void *mem;
+
+    mem = xmalloc(struct sedf_dom_info);
+    if ( mem == NULL )
+        return NULL;
+
+    memset(mem, 0, sizeof(struct sedf_dom_info));
+
+    return mem;
+}
+
+static int sedf_init_domain(struct scheduler *ops, struct domain *d)
+{
+    d->sched_priv = sedf_alloc_domdata(ops, d);
     if ( d->sched_priv == NULL )
         return -ENOMEM;
 
-    memset(d->sched_priv, 0, sizeof(struct sedf_dom_info));
-
     return 0;
 }
 
-static void sedf_destroy_domain(struct domain *d)
+static void sedf_free_domdata(struct scheduler *ops, void *data)
+{
+    xfree(data);
+}
+
+static void sedf_destroy_domain(struct scheduler *ops, struct domain *d)
 {
-    xfree(d->sched_priv);
+    sedf_free_domdata(ops, d->sched_priv);
 }
 
-static int sedf_pick_cpu(struct vcpu *v)
+static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v)
 {
     cpumask_t online_affinity;
+    cpumask_t *online;
 
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    online = SEDF_CPUONLINE(v->domain->cpupool);
+    cpus_and(online_affinity, v->cpu_affinity, *online);
     return first_cpu(online_affinity);
 }
 
@@ -751,7 +790,7 @@ static struct task_slice sedf_do_extra_s
    -timeslice for the current period used up
    -domain on waitqueue has started it's period
    -and various others ;) in general: determine which domain to run next*/
-static struct task_slice sedf_do_schedule(s_time_t now)
+static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now)
 {
     int                   cpu      = smp_processor_id();
     struct list_head     *runq     = RUNQ(cpu);
@@ -786,6 +825,13 @@ static struct task_slice sedf_do_schedul
     }
  check_waitq:
     update_queues(now, runq, waitq);
+
+    if ( unlikely(!cpu_isset(cpu, *SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+        goto sched_done;
+    }
  
     /*now simply pick the first domain from the runqueue, which has the
       earliest deadline, because the list is sorted*/
@@ -848,7 +894,7 @@ static struct task_slice sedf_do_schedul
 }
 
 
-static void sedf_sleep(struct vcpu *d)
+static void sedf_sleep(struct scheduler *ops, struct vcpu *d)
 {
     PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",
           d->domain->domain_id, d->vcpu_id);
@@ -1067,7 +1113,7 @@ static inline int should_switch(struct v
     return 1;
 }
 
-static void sedf_wake(struct vcpu *d)
+static void sedf_wake(struct scheduler *ops, struct vcpu *d)
 {
     s_time_t              now = NOW();
     struct sedf_vcpu_info* inf = EDOM_INFO(d);
@@ -1220,8 +1266,8 @@ static void sedf_dump_domain(struct vcpu
 }
 
 
-/* dumps all domains on hte specified cpu */
-static void sedf_dump_cpu_state(int i)
+/* dumps all domains on the specified cpu */
+static void sedf_dump_cpu_state(struct scheduler *ops, int i)
 {
     struct list_head      *list, *queue, *tmp;
     struct sedf_vcpu_info *d_inf;
@@ -1294,7 +1340,7 @@ static void sedf_dump_cpu_state(int i)
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c,
+                               struct xen_domctl_scheduler_op *cmd)
 {
     struct vcpu *p;
     struct domain      *d;
@@ -1315,6 +1361,8 @@ static int sedf_adjust_weights(struct xe
     rcu_read_lock(&domlist_read_lock);
     for_each_domain( d )
     {
+        if ( c != d->cpupool )
+            continue;
         for_each_vcpu( d, p )
         {
             if ( EDOM_INFO(p)->weight )
@@ -1366,7 +1414,7 @@ static int sedf_adjust_weights(struct xe
 
 
 /* set or fetch domain scheduling parameters */
-static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op)
+static int sedf_adjust(struct scheduler *ops, struct domain *p,
+                       struct xen_domctl_scheduler_op *op)
 {
     struct vcpu *v;
     int rc;
@@ -1376,9 +1424,6 @@ static int sedf_adjust(struct domain *p,
           p->domain_id, op->u.sedf.period, op->u.sedf.slice,
           op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no");
 
-    if ( !p->vcpu )
-        return -EINVAL;
-
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo )
     {
         /* Check for sane parameters. */
@@ -1428,7 +1473,7 @@ static int sedf_adjust(struct domain *p,
             }
         }
 
-        rc = sedf_adjust_weights(op);
+        rc = sedf_adjust_weights(p->cpupool, op);
         if ( rc )
             return rc;
 
@@ -1456,7 +1501,7 @@ static int sedf_adjust(struct domain *p,
     return 0;
 }
 
-const struct scheduler sched_sedf_def = {
+struct scheduler sched_sedf_def = {
     .name     = "Simple EDF Scheduler",
     .opt_name = "sedf",
     .sched_id = XEN_SCHEDULER_SEDF,
@@ -1464,9 +1509,15 @@ const struct scheduler sched_sedf_def = 
     .init_domain    = sedf_init_domain,
     .destroy_domain = sedf_destroy_domain,
 
-    .init_vcpu      = sedf_init_vcpu,
     .destroy_vcpu   = sedf_destroy_vcpu,
 
+    .alloc_vdata    = sedf_alloc_vdata,
+    .free_vdata     = sedf_free_vdata,
+    .alloc_pdata    = sedf_alloc_pdata,
+    .free_pdata     = sedf_free_pdata,
+    .alloc_domdata  = sedf_alloc_domdata,
+    .free_domdata   = sedf_free_domdata,
+
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
     .dump_cpu_state = sedf_dump_cpu_state,
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -53,10 +53,11 @@ static void poll_timer_fn(void *data);
 
 /* This is global for now so that private implementations can reach it */
 DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(struct scheduler *, scheduler);
 
-extern const struct scheduler sched_sedf_def;
-extern const struct scheduler sched_credit_def;
-static const struct scheduler *__initdata schedulers[] = {
+extern struct scheduler sched_sedf_def;
+extern struct scheduler sched_credit_def;
+static struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
     NULL
@@ -64,9 +65,15 @@ static const struct scheduler *__initdat
 
 static struct scheduler __read_mostly ops;
 
-#define SCHED_OP(fn, ...)                                 \
-         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
-          : (typeof(ops.fn(__VA_ARGS__)))0 )
+#define SCHED_OP(opsptr, fn, ...)                                          \
+         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
+          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
+
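+/* Domains in no cpupool are handled by the default scheduler 'ops' and may
+ * run on any online cpu; otherwise their pool's scheduler and cpus apply. */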
+#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched))
+#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
+#define VCPU2ONLINE(_v)                                                    \
+         (((_v)->domain->cpupool == NULL) ? &cpu_online_map                \
+         : &(_v)->domain->cpupool->cpu_valid)
 
 static inline void trace_runstate_change(struct vcpu *v, int new_state)
 {
@@ -207,7 +214,86 @@ int sched_init_vcpu(struct vcpu *v, unsi
 
     TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
 
-    return SCHED_OP(init_vcpu, v);
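+    /* Allocate per-cpu scheduler data the first time a vcpu uses this cpu. */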
+    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
+    {
+        per_cpu(schedule_data, v->processor).sched_priv =
+            SCHED_OP(DOM2OP(d), alloc_pdata, processor);
+        if ( per_cpu(schedule_data, v->processor).sched_priv == NULL )
+            return 1;
+    }
+
+    v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
+    if ( v->sched_priv == NULL )
+        return 1;
+
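+    /* Stash the idle vcpu's private data so schedule_cpu_switch() can
+     * restore it when the cpu returns to the default scheduler. */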
+    if ( is_idle_domain(d) )
+        per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv;
+
+    return 0;
+}
+
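+/*
+ * Move domain d to cpupool c: allocate domain and vcpu data from the new
+ * pool's scheduler, pause the domain, migrate its timers and vcpus to the
+ * pool's cpus, then free the data held for the old scheduler.
+ */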
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    unsigned int new_p;
+    void **vcpu_priv;
+    void *domdata;
+
+    domdata = SCHED_OP(&(c->sched), alloc_domdata, d);
+    if ( domdata == NULL )
+        return -ENOMEM;
+
+    vcpu_priv = xmalloc_array(void *, d->max_vcpus);
+    if ( vcpu_priv == NULL )
+    {
+        SCHED_OP(&(c->sched), free_domdata, domdata);
+        return -ENOMEM;
+    }
+
+    memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
+    for_each_vcpu ( d, v )
+    {
+        vcpu_priv[v->vcpu_id] = SCHED_OP(&(c->sched), alloc_vdata, v, domdata);
+        if ( vcpu_priv[v->vcpu_id] == NULL )
+        {
+            for_each_vcpu ( d, v )
+            {
+                if ( vcpu_priv[v->vcpu_id] != NULL )
+                    xfree(vcpu_priv[v->vcpu_id]);
+            }
+            xfree(vcpu_priv);
+            SCHED_OP(&(c->sched), free_domdata, domdata);
+            return -ENOMEM;
+        }
+    }
+
+    domain_pause(d);
+
+    new_p = first_cpu(c->cpu_valid);
+    for_each_vcpu ( d, v )
+    {
+        migrate_timer(&v->periodic_timer, new_p);
+        migrate_timer(&v->singleshot_timer, new_p);
+        migrate_timer(&v->poll_timer, new_p);
+
+        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+
+        cpus_setall(v->cpu_affinity);
+        v->processor = new_p;
+        v->sched_priv = vcpu_priv[v->vcpu_id];
+
+        new_p = cycle_cpu(new_p, c->cpu_valid);
+    }
+
+    d->cpupool = c;
+    SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
+    d->sched_priv = domdata;
+
+    domain_unpause(d);
+
+    xfree(vcpu_priv);
+
+    return 0;
 }
 
 void sched_destroy_vcpu(struct vcpu *v)
@@ -217,17 +303,17 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->poll_timer);
     if ( test_and_clear_bool(v->is_urgent) )
         atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
-    SCHED_OP(destroy_vcpu, v);
+    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 }
 
 int sched_init_domain(struct domain *d)
 {
-    return SCHED_OP(init_domain, d);
+    return SCHED_OP(DOM2OP(d), init_domain, d);
 }
 
 void sched_destroy_domain(struct domain *d)
 {
-    SCHED_OP(destroy_domain, d);
+    SCHED_OP(DOM2OP(d), destroy_domain, d);
 }
 
 void vcpu_sleep_nosync(struct vcpu *v)
@@ -241,7 +327,7 @@ void vcpu_sleep_nosync(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        SCHED_OP(sleep, v);
+        SCHED_OP(VCPU2OP(v), sleep, v);
     }
 
     vcpu_schedule_unlock_irqrestore(v, flags);
@@ -269,7 +355,7 @@ void vcpu_wake(struct vcpu *v)
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        SCHED_OP(wake, v);
+        SCHED_OP(VCPU2OP(v), wake, v);
     }
     else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
     {
@@ -324,7 +410,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Select new CPU. */
     old_cpu = v->processor;
-    new_cpu = SCHED_OP(pick_cpu, v);
+    new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
 
     /*
      * Transfer urgency status to new CPU before switching CPUs, as once
@@ -367,22 +453,32 @@ void vcpu_force_reschedule(struct vcpu *
 }
 
 /*
- * This function is used by cpu_hotplug code from stop_machine context.
- * Hence we can avoid needing to take the 
+ * This function is used by cpu_hotplug code from stop_machine context
+ * and from cpupools to switch schedulers on a cpu.
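+ * The 'lock' argument selects whether the per-vcpu schedule lock is taken
+ * here; -EAGAIN is returned if a vcpu could not be moved off the cpu and
+ * the caller has to retry.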
  */
-void cpu_disable_scheduler(void)
+int cpu_disable_scheduler(unsigned int cpu, int lock)
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int cpu = smp_processor_id();
+    struct cpupool *c;
+    int    ret = 0;
+
+    c = per_cpu(cpupool, cpu);
+    if ( c == NULL )
+        return ret;
 
     for_each_domain ( d )
     {
+        if ( d->cpupool != c )
+            continue;
+
         for_each_vcpu ( d, v )
         {
             if ( is_idle_vcpu(v) )
                 continue;
 
+            if ( lock != 0 )
+                vcpu_schedule_lock_irq(v);
             if ( (cpus_weight(v->cpu_affinity) == 1) &&
                  cpu_isset(cpu, v->cpu_affinity) )
             {
@@ -396,39 +492,51 @@ void cpu_disable_scheduler(void)
              * be chosen when the timer is next re-set.
              */
             if ( v->singleshot_timer.cpu == cpu )
-                migrate_timer(&v->singleshot_timer, 0);
+            {
+                int cpu_mig;
+
+                cpu_mig = first_cpu(c->cpu_valid);
+                if ( cpu_mig == cpu )
+                    cpu_mig = next_cpu(cpu_mig, c->cpu_valid);
+                migrate_timer(&v->singleshot_timer, cpu_mig);
+            }
 
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
+                if ( lock != 0 )
+                    vcpu_schedule_unlock_irq(v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
+            else if ( lock != 0 )
+                vcpu_schedule_unlock_irq(v);
+            /*
+             * A vcpu active in the hypervisor will not be migratable.
+             * The caller should try again after releasing and reacquiring
+             * all locks.
+             */
+            if ( v->processor == cpu )
+                ret = -EAGAIN;
         }
     }
+    return ret;
 }
 
-static int __vcpu_set_affinity(
-    struct vcpu *v, cpumask_t *affinity,
-    bool_t old_lock_status, bool_t new_lock_status)
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+    cpumask_t *online;
 
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    if ( v->domain->is_pinned )
+        return -EINVAL;
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, *online);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     vcpu_schedule_lock_irq(v);
 
-    if ( v->affinity_locked != old_lock_status )
-    {
-        BUG_ON(!v->affinity_locked);
-        vcpu_schedule_unlock_irq(v);
-        return -EBUSY;
-    }
-
-    v->affinity_locked = new_lock_status;
-
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
     *affinity = old_affinity;
@@ -446,36 +554,6 @@ static int __vcpu_set_affinity(
     return 0;
 }
 
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    if ( v->domain->is_pinned )
-        return -EINVAL;
-    return __vcpu_set_affinity(v, affinity, 0, 0);
-}
-
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 0, 1);
-}
-
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 1, 1);
-}
-
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    cpumask_t online_affinity;
-
-    /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
-    if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
-
-    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
-        BUG();
-}
-
 /* Block the currently-executing domain until a pertinent event occurs. */
 static long do_block(void)
 {
@@ -783,7 +861,7 @@ long sched_adjust(struct domain *d, stru
     struct vcpu *v;
     long ret;
     
-    if ( (op->sched_id != ops.sched_id) ||
+    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
          ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
           (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
@@ -810,7 +888,7 @@ long sched_adjust(struct domain *d, stru
     if ( d == current->domain )
         vcpu_schedule_lock_irq(current);
 
-    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
+    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
     if ( d == current->domain )
@@ -857,6 +935,7 @@ static void schedule(void)
 {
     struct vcpu          *prev = current, *next = NULL;
     s_time_t              now = NOW();
+    struct scheduler     *sched = this_cpu(scheduler);
     struct schedule_data *sd;
     struct task_slice     next_slice;
 
@@ -872,7 +951,7 @@ static void schedule(void)
     stop_timer(&sd->s_timer);
     
     /* get policy-specific decision on scheduling... */
-    next_slice = ops.do_schedule(now);
+    next_slice = sched->do_schedule(sched, now);
 
     next = next_slice.task;
 
@@ -978,6 +1057,19 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
+/* Get scheduler by id */
+struct scheduler *scheduler_get_by_id(unsigned int id)
+{
+    int i;
+
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        if ( schedulers[i]->sched_id == id )
+            return schedulers[i];
+    }
+    return NULL;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
@@ -985,12 +1077,6 @@ void __init scheduler_init(void)
 
     open_softirq(SCHEDULE_SOFTIRQ, schedule);
 
-    for_each_possible_cpu ( i )
-    {
-        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
-        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
-    }
-
     for ( i = 0; schedulers[i] != NULL; i++ )
     {
         ops = *schedulers[i];
@@ -1004,43 +1090,123 @@ void __init scheduler_init(void)
         ops = *schedulers[0];
     }
 
+    for_each_possible_cpu ( i )
+    {
+        per_cpu(scheduler, i) = &ops;
+        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
+        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
+    }
+
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(init);
+    if ( SCHED_OP(&ops, init) )
+        panic("scheduler returned error on init\n");
 }
 
-void dump_runq(unsigned char key)
+/* switch scheduler on cpu */
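+/*
+ * New per-cpu data (and, for a pool, new idle vcpu data) is allocated before
+ * the schedule lock is taken; the idle vcpu's private data is switched under
+ * the lock and the old scheduler's data is freed after the lock is dropped.
+ */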
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
 {
-    s_time_t      now = NOW();
-    int           i;
     unsigned long flags;
+    struct vcpu *v;
+    void *vpriv = NULL;
+    void *ppriv;
+    void *ppriv_old;
+    struct scheduler *old_ops;
+    struct scheduler *new_ops;
+
+    old_ops = per_cpu(scheduler, cpu);
+    new_ops = (c == NULL) ? &ops : &(c->sched);
+    v = per_cpu(schedule_data, cpu).idle;
+    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
+    if ( c != NULL )
+        vpriv = SCHED_OP(new_ops, alloc_vdata, v, v->domain->sched_priv);
+
+    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( c == NULL )
+    {
+        vpriv = v->sched_priv;
+        v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv;
+    }
+    else
+    {
+        v->sched_priv = vpriv;
+        vpriv = NULL;
+    }
+    SCHED_OP(old_ops, tick_suspend, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
+    per_cpu(schedule_data, cpu).sched_priv = ppriv;
+    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, insert_vcpu, v);
+
+    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( vpriv != NULL )
+        SCHED_OP(old_ops, free_vdata, vpriv);
+    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
+}
+
+/* init scheduler global data */
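+/* Copy the scheduler selected by 'name' (the default ops when the name is
+ * NULL or unknown) into *sched and run its init hook. */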
+int schedule_init_global(char *name, struct scheduler *sched)
+{
+    int i;
+    struct scheduler *data;
+
+    data = &ops;
+    for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ )
+    {
+        if ( strcmp(schedulers[i]->opt_name, name) == 0 )
+        {
+            data = schedulers[i];
+            break;
+        }
+    }
+    memcpy(sched, data, sizeof(*sched));
+    return SCHED_OP(sched, init);
+}
 
-    local_irq_save(flags);
+/* deinitialize scheduler global data */
+void schedule_deinit_global(struct scheduler *sched)
+{
+    SCHED_OP(sched, deinit);
+}
 
-    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(dump_settings);
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
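+/* Dump settings and per-cpu state of one cpupool (NULL: the free cpus). */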
+void schedule_dump(struct cpupool *c)
+{
+    int               i;
+    struct scheduler *sched;
+    cpumask_t        *cpus;
+
+    sched = (c == NULL) ? &ops : &(c->sched);
+    cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
+    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+    SCHED_OP(sched, dump_settings);
 
-    for_each_online_cpu ( i )
+    for_each_cpu_mask (i, *cpus)
     {
         spin_lock(&per_cpu(schedule_data, i).schedule_lock);
         printk("CPU[%02d] ", i);
-        SCHED_OP(dump_cpu_state, i);
+        SCHED_OP(sched, dump_cpu_state, i);
         spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
     }
-
-    local_irq_restore(flags);
 }
 
 void sched_tick_suspend(void)
 {
-    SCHED_OP(tick_suspend);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_suspend, cpu);
 }
 
 void sched_tick_resume(void)
 {
-    SCHED_OP(tick_resume);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_resume, cpu);
 }
 
 #ifdef CONFIG_COMPAT
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -88,9 +88,11 @@ void raise_softirq(unsigned int nr)
 }
 
 static LIST_HEAD(tasklet_list);
+static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu);
 static DEFINE_SPINLOCK(tasklet_lock);
 
-void tasklet_schedule(struct tasklet *t)
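+/* Queue a tasklet on the given list and kick TASKLET_SOFTIRQ on 'cpu'. */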
+static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist,
+    int cpu)
 {
     unsigned long flags;
 
@@ -101,28 +103,44 @@ void tasklet_schedule(struct tasklet *t)
         if ( !t->is_scheduled && !t->is_running )
         {
             BUG_ON(!list_empty(&t->list));
-            list_add_tail(&t->list, &tasklet_list);
+            list_add_tail(&t->list, tlist);
         }
         t->is_scheduled = 1;
-        raise_softirq(TASKLET_SOFTIRQ);
+        if ( cpu == smp_processor_id() )
+            raise_softirq(TASKLET_SOFTIRQ);
+        else
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
     }
 
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
+void tasklet_schedule(struct tasklet *t)
+{
+    tasklet_schedule_list(t, &tasklet_list, smp_processor_id());
+}
+
+void tasklet_schedule_cpu(struct tasklet *t, int cpu)
+{
+    tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu);
+}
+
 static void tasklet_action(void)
 {
+    struct list_head *tlist;
     struct tasklet *t;
 
     spin_lock_irq(&tasklet_lock);
 
-    if ( list_empty(&tasklet_list) )
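+    /* Prefer the per-cpu list, falling back to the global one if empty. */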
+    tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list :
+        &this_cpu(tasklet_list_pcpu);
+    if ( list_empty(tlist) )
     {
         spin_unlock_irq(&tasklet_lock);
         return;
     }
 
-    t = list_entry(tasklet_list.next, struct tasklet, list);
+    t = list_entry(tlist->next, struct tasklet, list);
     list_del_init(&t->list);
 
     BUG_ON(t->is_dead || t->is_running || !t->is_scheduled);
@@ -138,14 +156,15 @@ static void tasklet_action(void)
     if ( t->is_scheduled )
     {
         BUG_ON(t->is_dead || !list_empty(&t->list));
-        list_add_tail(&t->list, &tasklet_list);
+        list_add_tail(&t->list, tlist);
     }
 
     /*
      * If there is more work to do then reschedule. We don't grab more work
      * immediately as we want to allow other softirq work to happen first.
      */
-    if ( !list_empty(&tasklet_list) )
+    if ( !list_empty(&tasklet_list) ||
+         !list_empty(&this_cpu(tasklet_list_pcpu)) )
         raise_softirq(TASKLET_SOFTIRQ);
 
     spin_unlock_irq(&tasklet_lock);
@@ -186,6 +205,12 @@ void tasklet_init(
 
 void __init softirq_init(void)
 {
+    int i;
+
+    for_each_possible_cpu ( i )
+    {
+        INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i));
+    }
     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 }
 
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -314,6 +314,14 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
     }
     break;
 
+    case XEN_SYSCTL_cpupool_op:
+    {
+        ret = cpupool_do_sysctl(&op->u.cpupool_op);
+        if ( (ret == 0) && copy_to_guest(u_sysctl, op, 1) )
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         ret = arch_do_sysctl(op, u_sysctl);
         break;
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -458,7 +458,8 @@ struct arch_vcpu
 #define hvm_svm         hvm_vcpu.u.svm
 
-/* Continue the current hypercall via func(data) on specified cpu. */
+/* Continue the current hypercall via func(hdl, data) on the specified cpu. */
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data);
 
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -60,10 +60,10 @@ struct xen_domctl_createdomain {
  /* Should domain memory integrity be verifed by tboot during Sx? */
 #define _XEN_DOMCTL_CDF_s3_integrity  2
 #define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
-    uint32_t flags;
  /* Disable out-of-sync shadow page tables? */
 #define _XEN_DOMCTL_CDF_oos_off       3
 #define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+    uint32_t flags;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -106,6 +106,7 @@ struct xen_domctl_getdomaininfo {
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
@@ -781,7 +782,6 @@ struct xen_domctl_mem_sharing_op {
 typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t);
 
-
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -491,6 +491,28 @@ struct xen_sysctl_lockprof_op {
 typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);
 
+#define XEN_SYSCTL_cpupool_op        18
+/* XEN_SYSCTL_cpupool_op */
+#define XEN_SYSCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_SYSCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_SYSCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_SYSCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_SYSCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_SYSCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_SYSCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
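+/* The letters in the field comments refer to the ops defined above. */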
+struct xen_sysctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t);
+
 struct xen_sysctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -509,6 +531,7 @@ struct xen_sysctl {
         struct xen_sysctl_pm_op             pm_op;
         struct xen_sysctl_page_offline_op   page_offline;
         struct xen_sysctl_lockprof_op       lockprof_op;
+        struct xen_sysctl_cpupool_op        cpupool_op;
         uint8_t                             pad[128];
     } u;
 };
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -10,16 +10,26 @@
 
 #include <xen/percpu.h>
 
+
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
+
 struct schedule_data {
     spinlock_t          schedule_lock;  /* spinlock protecting curr        */
     struct vcpu        *curr;           /* current task                    */
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
+    void               *sched_idlevpriv; /* default scheduler idle vcpu data */
     struct timer        s_timer;        /* scheduling timer                */
     atomic_t            urgent_count;   /* how many urgent vcpus           */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
+DECLARE_PER_CPU(struct scheduler *, scheduler);
+DECLARE_PER_CPU(struct cpupool *, cpupool);
 
 static inline void vcpu_schedule_lock(struct vcpu *v)
 {
@@ -59,28 +69,49 @@ struct scheduler {
     char *name;             /* full name for this scheduler      */
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
+
+    int          (*init)           (struct scheduler *);
+    void         (*deinit)         (struct scheduler *);
 
-    void         (*init)           (void);
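+    /* Allocators/destructors for per-vcpu, per-pcpu and per-domain data. */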
+    void         (*free_vdata)     (struct scheduler *, void *);
+    void *       (*alloc_vdata)    (struct scheduler *, struct vcpu *,
+                                    void *);
+    void         (*free_pdata)     (struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (struct scheduler *, int);
+    void         (*free_domdata)   (struct scheduler *, void *);
+    void *       (*alloc_domdata)  (struct scheduler *, struct domain *);
 
-    int          (*init_domain)    (struct domain *);
-    void         (*destroy_domain) (struct domain *);
+    int          (*init_domain)    (struct scheduler *, struct domain *);
+    void         (*destroy_domain) (struct scheduler *, struct domain *);
 
-    int          (*init_vcpu)      (struct vcpu *);
-    void         (*destroy_vcpu)   (struct vcpu *);
+    void         (*insert_vcpu)    (struct scheduler *, struct vcpu *);
+    void         (*destroy_vcpu)   (struct scheduler *, struct vcpu *);
 
-    void         (*sleep)          (struct vcpu *);
-    void         (*wake)           (struct vcpu *);
+    void         (*sleep)          (struct scheduler *, struct vcpu *);
+    void         (*wake)           (struct scheduler *, struct vcpu *);
 
-    struct task_slice (*do_schedule) (s_time_t);
+    struct task_slice (*do_schedule) (struct scheduler *, s_time_t);
 
-    int          (*pick_cpu)       (struct vcpu *);
-    int          (*adjust)         (struct domain *,
+    int          (*pick_cpu)       (struct scheduler *, struct vcpu *);
+    int          (*adjust)         (struct scheduler *, struct domain *,
                                     struct xen_domctl_scheduler_op *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
+    void         (*dump_settings)  (struct scheduler *);
+    void         (*dump_cpu_state) (struct scheduler *, int);
 
-    void         (*tick_suspend)    (void);
-    void         (*tick_resume)     (void);
+    void         (*tick_suspend)    (struct scheduler *, unsigned int);
+    void         (*tick_resume)     (struct scheduler *, unsigned int);
 };
 
+struct cpupool
+{
+    int              cpupool_id;
+    cpumask_t        cpu_valid;      /* all cpus assigned to pool */
+    struct cpupool   *next;
+    unsigned int     n_dom;
+    struct scheduler sched;
+};
+
+struct scheduler *scheduler_get_by_id(unsigned int id);
+
 #endif /* __XEN_SCHED_IF_H__ */
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -9,6 +9,7 @@
 #include <xen/shared.h>
 #include <public/xen.h>
 #include <public/domctl.h>
+#include <public/sysctl.h>
 #include <public/vcpu.h>
 #include <public/xsm/acm.h>
 #include <xen/time.h>
@@ -132,8 +133,6 @@ struct vcpu 
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
-    /* VCPU affinity is temporarily locked from controller changes? */
-    bool_t           affinity_locked;
 
     /*
      * > 0: a single port is being polled;
@@ -209,6 +208,7 @@ struct domain
 
     /* Scheduling. */
     void            *sched_priv;    /* scheduler-specific data */
+    struct cpupool  *cpupool;
 
     struct domain   *next_in_list;
     struct domain   *next_in_hashbucket;
@@ -381,7 +381,7 @@ static inline struct domain *get_current
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
 #define _DOMCRF_hvm           0
 #define DOMCRF_hvm            (1U<<_DOMCRF_hvm)
@@ -469,6 +469,7 @@ int  sched_init_vcpu(struct vcpu *v, uns
 void sched_destroy_vcpu(struct vcpu *v);
 int  sched_init_domain(struct domain *d);
 void sched_destroy_domain(struct domain *d);
+int sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 int  sched_id(void);
 void sched_tick_suspend(void);
@@ -576,12 +577,14 @@ void domain_pause_by_systemcontroller(st
 void domain_unpause_by_systemcontroller(struct domain *d);
 void cpu_init(void);
 
+struct scheduler;
+
+int schedule_init_global(char *name, struct scheduler *sched);
+void schedule_deinit_global(struct scheduler *sched);
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
-void cpu_disable_scheduler(void);
+int cpu_disable_scheduler(unsigned int cpu, int lock);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
@@ -604,6 +607,19 @@ extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
 
+#define CPUPOOLID_NONE    -1
+
+struct cpupool *cpupool_create(int poolid, char *sched);
+int cpupool_destroy(struct cpupool *c);
+int cpupool0_cpu_assign(struct cpupool *c);
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu);
+void cpupool_cpu_add(unsigned int cpu);
+int cpupool_cpu_remove(unsigned int cpu);
+int cpupool_add_domain(struct domain *d, int poolid);
+void cpupool_rm_domain(struct domain *d);
+int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op);
+#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid))
+
 #endif /* __SCHED_H__ */
 
 /*
--- a/xen/include/xen/softirq.h
+++ b/xen/include/xen/softirq.h
@@ -58,6 +58,7 @@ struct tasklet
     struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data }
 
 void tasklet_schedule(struct tasklet *t);
+void tasklet_schedule_cpu(struct tasklet *t, int cpu);
 void tasklet_kill(struct tasklet *t);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);