From: Juergen Gross Index: xen-4.0.1-testing/xen/arch/x86/acpi/power.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/acpi/power.c +++ xen-4.0.1-testing/xen/arch/x86/acpi/power.c @@ -234,7 +234,7 @@ static int enter_state(u32 state) return error; } -static long enter_state_helper(void *data) +static long enter_state_helper(void *hdl, void *data) { struct acpi_sleep_info *sinfo = (struct acpi_sleep_info *)data; return enter_state(sinfo->sleep_state); @@ -265,7 +265,7 @@ int acpi_enter_sleep(struct xenpf_enter_ acpi_sinfo.pm1b_cnt_val = sleep->pm1b_cnt_val; acpi_sinfo.sleep_state = sleep->sleep_state; - return continue_hypercall_on_cpu(0, enter_state_helper, &acpi_sinfo); + return continue_hypercall_on_cpu(0, NULL, enter_state_helper, &acpi_sinfo); } static int acpi_get_wake_status(void) Index: xen-4.0.1-testing/xen/arch/x86/domain.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/domain.c +++ xen-4.0.1-testing/xen/arch/x86/domain.c @@ -1522,42 +1522,52 @@ void sync_vcpu_execstate(struct vcpu *v) } struct migrate_info { - long (*func)(void *data); + struct tasklet tasklet; + long (*func)(void *hdl, void *data); void *data; void (*saved_schedule_tail)(struct vcpu *); - cpumask_t saved_affinity; - unsigned int nest; + volatile int nest; + long ret; + struct vcpu *v; }; static void continue_hypercall_on_cpu_helper(struct vcpu *v) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct migrate_info *info = v->arch.continue_info; - cpumask_t mask = info->saved_affinity; void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail; - regs->eax = info->func(info->data); + regs->eax = info->ret; - if ( info->nest-- == 0 ) - { - xfree(info); - v->arch.schedule_tail = saved_schedule_tail; - v->arch.continue_info = NULL; - vcpu_unlock_affinity(v, &mask); - } + tasklet_kill(&info->tasklet); + xfree(info); + v->arch.schedule_tail = saved_schedule_tail; + v->arch.continue_info = NULL; (*saved_schedule_tail)(v); } -int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data) +static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info) +{ + info->ret = info->func((void *)info, info->data); + + if ( info->nest-- == 0 ) + vcpu_unpause(info->v); + + return; +} + +int continue_hypercall_on_cpu(int cpu, void *hdl, + long (*func)(void *hdl, void *data), void *data) { struct vcpu *v = current; - struct migrate_info *info; - cpumask_t mask = cpumask_of_cpu(cpu); - int rc; + struct migrate_info *info = (struct migrate_info *)hdl; if ( cpu == smp_processor_id() ) - return func(data); + return func(info, data); + + if ( info != NULL ) + v = info->v; info = v->arch.continue_info; if ( info == NULL ) @@ -1566,16 +1576,12 @@ int continue_hypercall_on_cpu(int cpu, l if ( info == NULL ) return -ENOMEM; - rc = vcpu_lock_affinity(v, &mask); - if ( rc ) - { - xfree(info); - return rc; - } - info->saved_schedule_tail = v->arch.schedule_tail; - info->saved_affinity = mask; info->nest = 0; + info->v = v; + tasklet_init(&info->tasklet, + (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet, + (unsigned long)info); v->arch.schedule_tail = continue_hypercall_on_cpu_helper; v->arch.continue_info = info; @@ -1583,17 +1589,17 @@ int continue_hypercall_on_cpu(int cpu, l else { BUG_ON(info->nest != 0); - rc = vcpu_locked_change_affinity(v, &mask); - if ( rc ) - return rc; info->nest++; } info->func = func; info->data = data; + vcpu_pause_nosync(v); + 
tasklet_schedule_cpu(&info->tasklet, cpu); + raise_softirq(SCHEDULE_SOFTIRQ); + /* Dummy return value will be overwritten by new schedule_tail. */ - BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id()))); return 0; } Index: xen-4.0.1-testing/xen/arch/x86/domain_build.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/domain_build.c +++ xen-4.0.1-testing/xen/arch/x86/domain_build.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -84,7 +85,7 @@ integer_param("dom0_max_vcpus", opt_dom0 struct vcpu *__init alloc_dom0_vcpu0(void) { if ( opt_dom0_max_vcpus == 0 ) - opt_dom0_max_vcpus = num_online_cpus(); + opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0); if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS ) opt_dom0_max_vcpus = MAX_VIRT_CPUS; @@ -287,7 +288,7 @@ int __init construct_dom0( unsigned long _initrd_start, unsigned long initrd_len, char *cmdline) { - int i, rc, compatible, compat32, order, machine; + int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; unsigned long nr_pages; @@ -786,8 +787,12 @@ int __init construct_dom0( printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus); + cpu = first_cpu(cpupool0->cpu_valid); for ( i = 1; i < opt_dom0_max_vcpus; i++ ) - (void)alloc_vcpu(d, i, i % num_online_cpus()); + { + cpu = cycle_cpu(cpu, cpupool0->cpu_valid); + (void)alloc_vcpu(d, i, cpu); + } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) Index: xen-4.0.1-testing/xen/arch/x86/microcode.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/microcode.c +++ xen-4.0.1-testing/xen/arch/x86/microcode.c @@ -114,7 +114,7 @@ static int microcode_update_cpu(const vo return err; } -static long do_microcode_update(void *_info) +static long do_microcode_update(void *hdl, void *_info) { struct microcode_info *info = _info; int error; @@ -127,7 +127,8 @@ static long do_microcode_update(void *_i info->cpu = next_cpu(info->cpu, cpu_online_map); if ( info->cpu < NR_CPUS ) - return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); + return continue_hypercall_on_cpu(info->cpu, hdl, + do_microcode_update, info); error = info->error; xfree(info); @@ -160,5 +161,6 @@ int microcode_update(XEN_GUEST_HANDLE(co info->error = 0; info->cpu = first_cpu(cpu_online_map); - return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); + return continue_hypercall_on_cpu(info->cpu, NULL, + do_microcode_update, info); } Index: xen-4.0.1-testing/xen/arch/x86/mm.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/mm.c +++ xen-4.0.1-testing/xen/arch/x86/mm.c @@ -243,7 +243,7 @@ void __init arch_init_memory(void) * Any Xen-heap pages that we will allow to be mapped will have * their domain field set to dom_xen. */ - dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); + dom_xen = domain_create(DOMID_XEN, CPUPOOLID_NONE, DOMCRF_dummy, 0); BUG_ON(dom_xen == NULL); /* @@ -251,14 +251,14 @@ void __init arch_init_memory(void) * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. */ - dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); + dom_io = domain_create(DOMID_IO, CPUPOOLID_NONE, DOMCRF_dummy, 0); BUG_ON(dom_io == NULL); /* * Initialise our DOMID_IO domain. * This domain owns sharable pages. 
*/ - dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0); + dom_cow = domain_create(DOMID_COW, CPUPOOLID_NONE, DOMCRF_dummy, 0); BUG_ON(dom_cow == NULL); /* First 1MB of RAM is historically marked as I/O. */ Index: xen-4.0.1-testing/xen/arch/x86/platform_hypercall.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/platform_hypercall.c +++ xen-4.0.1-testing/xen/arch/x86/platform_hypercall.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -48,12 +49,12 @@ static DEFINE_PER_CPU(uint64_t, freq); extern int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf); extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power); -static long cpu_frequency_change_helper(void *data) +static long cpu_frequency_change_helper(void *hdl, void *data) { return cpu_frequency_change(this_cpu(freq)); } -static long cpu_down_helper(void *data) +static long cpu_down_helper(void *hdl, void *data) { int cpu = (unsigned long)data; return cpu_down(cpu); @@ -314,7 +315,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) ) break; per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq; - ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, + ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, NULL, cpu_frequency_change_helper, NULL); break; @@ -406,7 +407,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe g_info = &op->u.pcpu_info; /* spin_trylock() avoids deadlock with stop_machine_run(). */ - if ( !spin_trylock(&cpu_add_remove_lock) ) + if ( !spin_trylock(&cpupool_lock) ) { ret = -EBUSY; break; @@ -429,7 +430,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe g_info->max_present = last_cpu(cpu_present_map); - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); ret = copy_to_guest(u_xenpf_op, op, 1) ? -EFAULT : 0; } @@ -470,7 +471,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe break; } ret = continue_hypercall_on_cpu( - 0, cpu_down_helper, (void *)(unsigned long)cpu); + 0, NULL, cpu_down_helper, (void *)(unsigned long)cpu); break; } break; Index: xen-4.0.1-testing/xen/arch/x86/setup.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/setup.c +++ xen-4.0.1-testing/xen/arch/x86/setup.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,7 @@ static void __init init_idle_domain(void /* Domain creation requires that scheduler structures are initialised. */ scheduler_init(); - idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0); + idle_domain = domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0); if ( idle_domain == NULL ) BUG(); idle_domain->vcpu = idle_vcpu; @@ -1122,8 +1123,13 @@ void __init __start_xen(unsigned long mb if ( !tboot_protect_mem_regions() ) panic("Could not protect TXT memory regions\n"); + /* Create initial cpupool 0. */ + cpupool0 = cpupool_create(0, NULL); + if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) ) + panic("Error creating cpupool 0\n"); + /* Create initial domain 0. 
*/ - dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF); + dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF); if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0\n"); Index: xen-4.0.1-testing/xen/arch/x86/smpboot.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/smpboot.c +++ xen-4.0.1-testing/xen/arch/x86/smpboot.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -104,7 +105,6 @@ static void map_cpu_to_logical_apicid(vo DEFINE_PER_CPU(int, cpu_state) = { 0 }; void *stack_base[NR_CPUS]; -DEFINE_SPINLOCK(cpu_add_remove_lock); /* * The bootstrap kernel entry code has set these up. Save them for @@ -821,7 +821,7 @@ wakeup_secondary_cpu(int phys_apicid, un extern cpumask_t cpu_initialized; /* - * Caller should hold cpu_add_remove_lock if not called when booting + * Caller should hold cpupool_lock if not called when booting */ int alloc_cpu_id(void) { @@ -1306,10 +1306,11 @@ int __cpu_disable(void) __sync_lazy_execstate(); /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpupool0->cpu_valid); cpu_clear(cpu, cpu_online_map); fixup_irqs(); - cpu_disable_scheduler(); + cpu_disable_scheduler(cpu, 0); return 0; } @@ -1343,10 +1344,10 @@ int cpu_down(unsigned int cpu) int err = 0; /* spin_trylock() avoids deadlock with stop_machine_run(). */ - if (!spin_trylock(&cpu_add_remove_lock)) + if (!spin_trylock(&cpupool_lock)) return -EBUSY; - if (num_online_cpus() == 1) { + if ((!cpu_isset(cpu, cpupool0->cpu_valid)) || (cpus_weight(cpupool0->cpu_valid) == 1)) { err = -EBUSY; goto out; } @@ -1379,7 +1380,7 @@ int cpu_down(unsigned int cpu) out: if (!err) send_guest_global_virq(dom0, VIRQ_PCPU_STATE); - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); return err; } @@ -1388,7 +1389,7 @@ int cpu_up(unsigned int cpu) int err = 0; /* spin_trylock() avoids deadlock with stop_machine_run(). */ - if (!spin_trylock(&cpu_add_remove_lock)) + if (!spin_trylock(&cpupool_lock)) return -EBUSY; if (cpu_online(cpu)) { @@ -1406,7 +1407,7 @@ int cpu_up(unsigned int cpu) out: if (!err) send_guest_global_virq(dom0, VIRQ_PCPU_STATE); - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); return err; } @@ -1492,14 +1493,14 @@ int cpu_add(uint32_t apic_id, uint32_t a return -EEXIST; /* spin_trylock() avoids deadlock with stop_machine_run(). 
*/ - if (!spin_trylock(&cpu_add_remove_lock)) + if (!spin_trylock(&cpupool_lock)) return -EBUSY; cpu = mp_register_lapic(apic_id, 1); if (cpu < 0) { - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); return cpu; } @@ -1516,7 +1517,7 @@ int cpu_add(uint32_t apic_id, uint32_t a "Setup node failed for pxm %x\n", pxm); x86_acpiid_to_apicid[acpi_id] = 0xff; mp_unregister_lapic(apic_id, cpu); - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); return node; } apicid_to_node[apic_id] = node; @@ -1524,7 +1525,7 @@ int cpu_add(uint32_t apic_id, uint32_t a srat_detect_node(cpu); numa_add_cpu(cpu); - spin_unlock(&cpu_add_remove_lock); + spin_unlock(&cpupool_lock); dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu); return cpu; } @@ -1568,6 +1569,7 @@ int __devinit __cpu_up(unsigned int cpu) process_pending_softirqs(); } + cpupool_cpu_add(cpu); cpufreq_add_cpu(cpu); return 0; } Index: xen-4.0.1-testing/xen/arch/x86/sysctl.c =================================================================== --- xen-4.0.1-testing.orig/xen/arch/x86/sysctl.c +++ xen-4.0.1-testing/xen/arch/x86/sysctl.c @@ -29,7 +29,7 @@ #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) -static long cpu_down_helper(void *data) +static long cpu_down_helper(void *hdl, void *data) { int cpu = (unsigned long)data; return cpu_down(cpu); @@ -122,7 +122,7 @@ long arch_do_sysctl( break; case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE: ret = continue_hypercall_on_cpu( - 0, cpu_down_helper, (void *)(unsigned long)cpu); + 0, NULL, cpu_down_helper, (void *)(unsigned long)cpu); break; case XEN_SYSCTL_CPU_HOTPLUG_STATUS: ret = 0; Index: xen-4.0.1-testing/xen/common/Makefile =================================================================== --- xen-4.0.1-testing.orig/xen/common/Makefile +++ xen-4.0.1-testing/xen/common/Makefile @@ -1,5 +1,6 @@ obj-y += bitmap.o obj-y += cpu.o +obj-y += cpupool.o obj-y += domctl.o obj-y += domain.o obj-y += event_channel.o Index: xen-4.0.1-testing/xen/common/cpupool.c =================================================================== --- /dev/null +++ xen-4.0.1-testing/xen/common/cpupool.c @@ -0,0 +1,585 @@ +/****************************************************************************** + * cpupool.c + * + * Generic cpupool-handling functions. + * + * Cpupools are a feature to have configurable scheduling domains. Each + * cpupool runs an own scheduler on a dedicated set of physical cpus. + * A domain is bound to one cpupool at any time, but it can be moved to + * another cpupool. + * + * (C) 2009, Juergen Gross, Fujitsu Technology Solutions + */ + +#include +#include +#include +#include +#include +#include + +#define for_each_cpupool(ptr) \ + for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) + +struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ +cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ + +static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ + +static int cpupool0_max_cpus; +integer_param("pool0_max_cpus", cpupool0_max_cpus); + +static int cpupool_moving_cpu = -1; +static struct cpupool *cpupool_cpu_moving = NULL; + +/* cpupool lock: be carefull, this lock is sometimes released on another cpu + * as it was obtained! + */ +DEFINE_SPINLOCK(cpupool_lock); + +DEFINE_PER_CPU(struct cpupool *, cpupool); + +static struct cpupool *alloc_cpupool_struct(void) +{ + return xmalloc(struct cpupool); +} + +static void free_cpupool_struct(struct cpupool *c) +{ + xfree(c); +} + +/* + * find a cpupool by it's id. 
to be called with cpupool lock held + * if exact is not specified, the first cpupool with an id larger or equal to + * the searched id is returned + * returns NULL if not found. + */ +static struct cpupool *cpupool_find_by_id(int id, int exact) +{ + struct cpupool **q; + + for_each_cpupool(q) + { + if ( (*q)->cpupool_id == id ) + return *q; + if ( (*q)->cpupool_id > id ) + break; + } + return exact ? NULL : *q; +} + +/* + * create a new cpupool with specified poolid and scheduler + * returns pointer to new cpupool structure if okay, NULL else + * possible failures: + * - no memory + * - poolid already used + * - unknown scheduler + */ +struct cpupool *cpupool_create(int poolid, char *sched) +{ + struct cpupool *c; + struct cpupool **q; + int last = 0; + + if ( (c = alloc_cpupool_struct()) == NULL ) + return NULL; + memset(c, 0, sizeof(*c)); + + printk(XENLOG_DEBUG "cpupool_create(pool=%d,sched=%s)\n", poolid, sched); + spin_lock(&cpupool_lock); + for_each_cpupool(q) + { + last = (*q)->cpupool_id; + if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) + break; + } + if ( *q != NULL ) + { + if ( (*q)->cpupool_id == poolid ) + { + spin_unlock(&cpupool_lock); + free_cpupool_struct(c); + return NULL; + } + c->next = *q; + } + *q = c; + c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid; + if ( schedule_init_global(sched, &(c->sched)) ) + { + spin_unlock(&cpupool_lock); + cpupool_destroy(c); + return NULL; + } + spin_unlock(&cpupool_lock); + + printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id, + c->sched.name, c->sched.opt_name); + + return c; +} +/* + * destroys the given cpupool + * returns 0 on success, 1 else + * possible failures: + * - pool still in use + * - cpus still assigned to pool + * - pool not in list + */ +int cpupool_destroy(struct cpupool *c) +{ + struct cpupool **q; + + spin_lock(&cpupool_lock); + for_each_cpupool(q) + if ( *q == c ) + break; + if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) ) + { + spin_unlock(&cpupool_lock); + return 1; + } + *q = c->next; + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_destroy(pool=%d)\n", c->cpupool_id); + schedule_deinit_global(&(c->sched)); + free_cpupool_struct(c); + return 0; +} + +/* + * assign a specific cpu to a cpupool + * cpupool_lock must be held + */ +static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) +{ + if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) + return -EBUSY; + per_cpu(cpupool, cpu) = c; + schedule_cpu_switch(cpu, c); + cpu_clear(cpu, cpupool_free_cpus); + if (cpupool_moving_cpu == cpu) + { + cpupool_moving_cpu = -1; + cpupool_cpu_moving = NULL; + } + cpu_set(cpu, c->cpu_valid); + return 0; +} + +/* + * assign free physical cpus to a cpupool + * cpus assigned are unused cpus with lowest possible ids + * returns the number of cpus assigned + */ +int cpupool_assign_ncpu(struct cpupool *c, int ncpu) +{ + int i; + int n; + + n = 0; + spin_lock(&cpupool_lock); + for_each_cpu_mask(i, cpupool_free_cpus) + { + if ( cpupool_assign_cpu_locked(c, i) == 0 ) + n++; + if ( n == ncpu ) + break; + } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_assign_ncpu(pool=%d,ncpu=%d) rc %d\n", + c->cpupool_id, ncpu, n); + return n; +} + +static long cpupool_unassign_cpu_helper(void *hdl, void *info) +{ + struct cpupool *c = (struct cpupool *)info; + int cpu = cpupool_moving_cpu; + long ret; + int cpupool_id = c->cpupool_id; + + ret = cpu_disable_scheduler(cpu, 1); + cpu_set(cpu, cpupool_free_cpus); + if ( !ret ) + { + 
schedule_cpu_switch(cpu, NULL); + per_cpu(cpupool, cpu) = NULL; + cpupool_moving_cpu = -1; + cpupool_cpu_moving = NULL; + } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %ld\n", + cpupool_id, cpu, ret); + return ret; +} + +/* + * unassign a specific cpu from a cpupool + * we must be sure not to run on the cpu to be unassigned! to achieve this + * the main functionality is performed via continue_hypercall_on_cpu on a + * specific cpu. + * if the cpu to be removed is the last one of the cpupool no active domain + * must be bound to the cpupool. dying domains are moved to cpupool0 as they + * might be zombies. + * possible failures: + * - last cpu and still active domains in cpupool + */ +int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) +{ + int work_cpu; + int ret; + struct domain *d; + int cpupool_id = c->cpupool_id; + + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + cpupool_id, cpu); + spin_lock(&cpupool_lock); + ret = -EBUSY; + if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) ) + goto out; + + ret = 0; + if ( !cpu_isset(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) ) + goto out; + + if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) && + (cpu != cpupool_moving_cpu) ) + { + for_each_domain(d) + { + if ( d->cpupool != c ) + continue; + if ( !d->is_dying ) + { + ret = -EBUSY; + break; + } + c->n_dom--; + ret = sched_move_domain(d, cpupool0); + if ( ret ) + { + c->n_dom++; + break; + } + cpupool0->n_dom++; + } + if ( ret ) + goto out; + } + cpupool_moving_cpu = cpu; + cpupool_cpu_moving = c; + cpu_clear(cpu, c->cpu_valid); + work_cpu = smp_processor_id(); + if ( work_cpu == cpu ) + { + work_cpu = first_cpu(cpupool0->cpu_valid); + if ( work_cpu == cpu ) + work_cpu = next_cpu(cpu, cpupool0->cpu_valid); + } + return continue_hypercall_on_cpu(work_cpu, NULL, + cpupool_unassign_cpu_helper, c); + +out: + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", + cpupool_id, cpu, ret); + return ret; +} + +/* + * assign cpus to the default cpupool + * default are all cpus, less cpus may be specified as boot parameter + * possible failures: + * - no cpu assigned + */ +int __init cpupool0_cpu_assign(struct cpupool *c) +{ + if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) ) + cpupool0_max_cpus = num_online_cpus(); + if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) ) + return 1; + return 0; +} + +/* + * add a new domain to a cpupool + * possible failures: + * - pool does not exist + * - no cpu assigned to pool + */ +int cpupool_add_domain(struct domain *d, int poolid) +{ + struct cpupool *c; + int rc = 1; + int n_dom; + + if ( poolid == CPUPOOLID_NONE ) + return 0; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(poolid, 1); + if ( (c != NULL) && cpus_weight(c->cpu_valid) ) + { + c->n_dom++; + n_dom = c->n_dom; + d->cpupool = c; + rc = 0; + } + spin_unlock(&cpupool_lock); + if (!rc) + printk(XENLOG_DEBUG "cpupool_add_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, poolid, n_dom); + return rc; +} + +/* + * remove a domain from a cpupool + */ +void cpupool_rm_domain(struct domain *d) +{ + int cpupool_id; + int n_dom; + + if ( d->cpupool == NULL ) + return; + spin_lock(&cpupool_lock); + cpupool_id = d->cpupool->cpupool_id; + d->cpupool->n_dom--; + n_dom = d->cpupool->n_dom; + d->cpupool = NULL; + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, cpupool_id, n_dom); + 
return; +} + +/* + * called to add a new cpu to pool admin + * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0 + */ +void cpupool_cpu_add(unsigned int cpu) +{ + if ( cpupool0 == NULL ) + return; + spin_lock(&cpupool_lock); + cpu_set(cpu, cpupool_free_cpus); + (void)cpupool_assign_cpu_locked(cpupool0, cpu); + spin_unlock(&cpupool_lock); + return; +} + +/* + * do cpupool related sysctl operations + */ +int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) +{ + int ret; + struct cpupool *c; + + switch ( op->op ) + { + + case XEN_SYSCTL_CPUPOOL_OP_CREATE: + { + int poolid; + struct scheduler *sched; + + poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? + CPUPOOLID_NONE: op->cpupool_id; + sched = scheduler_get_by_id(op->sched_id); + ret = -ENOENT; + if ( sched == NULL ) + break; + ret = 0; + c = cpupool_create(poolid, sched->opt_name); + if ( c == NULL ) + ret = -EINVAL; + else + op->cpupool_id = c->cpupool_id; + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_DESTROY: + { + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 1); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0; + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_INFO: + { + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 0); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + op->cpupool_id = c->cpupool_id; + op->sched_id = c->sched.sched_id; + op->n_dom = c->n_dom; + cpumask_to_xenctl_cpumap(&(op->cpumap), &(c->cpu_valid)); + ret = 0; + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: + { + unsigned cpu; + + cpu = op->cpu; + printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d)\n", + op->cpupool_id, cpu); + spin_lock(&cpupool_lock); + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + cpu = first_cpu(cpupool_free_cpus); + ret = -EINVAL; + if ( cpu >= NR_CPUS ) + goto addcpu_out; + ret = -EBUSY; + if ( !cpu_isset(cpu, cpupool_free_cpus) ) + goto addcpu_out; + c = cpupool_find_by_id(op->cpupool_id, 0); + ret = -ENOENT; + if ( c == NULL ) + goto addcpu_out; + ret = cpupool_assign_cpu_locked(c, cpu); +addcpu_out: + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", + op->cpupool_id, cpu, ret); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_RMCPU: + { + unsigned cpu; + + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 0); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + cpu = op->cpu; + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + cpu = last_cpu(c->cpu_valid); + ret = -EINVAL; + if ( cpu >= NR_CPUS ) + break; + /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and + * will continue after the local return + */ + ret = cpupool_unassign_cpu(c, cpu); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: + { + struct domain *d; + + ret = -EINVAL; + if ( op->domid == 0 ) + break; + ret = -ESRCH; + d = rcu_lock_domain_by_id(op->domid); + if ( d == NULL ) + break; + if ( d->cpupool == NULL ) + { + ret = -EINVAL; + rcu_unlock_domain(d); + break; + } + if ( op->cpupool_id == d->cpupool->cpupool_id ) + { + ret = 0; + rcu_unlock_domain(d); + break; + } + printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d\n", + d->domain_id, op->cpupool_id); + ret = -ENOENT; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 1); + if ( (c != NULL) && cpus_weight(c->cpu_valid) ) + { + d->cpupool->n_dom--; + ret = sched_move_domain(d, c); + if ( ret ) + d->cpupool->n_dom++; + else + c->n_dom++; 
+ } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d ret %d\n", + d->domain_id, op->cpupool_id, ret); + rcu_unlock_domain(d); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: + { + cpumask_to_xenctl_cpumap(&(op->cpumap), + &cpupool_free_cpus); + ret = 0; + } + break; + + default: + ret = -ENOSYS; + + } + + return ret; +} + +void schedule_dump(struct cpupool *c); + +void dump_runq(unsigned char key) +{ + unsigned long flags; + s_time_t now = NOW(); + struct cpupool **c; + + spin_lock(&cpupool_lock); + local_irq_save(flags); + + printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); + + printk("Idle cpupool:\n"); + schedule_dump(NULL); + + for_each_cpupool(c) + { + printk("Cpupool %d:\n", (*c)->cpupool_id); + schedule_dump(*c); + } + + local_irq_restore(flags); + spin_unlock(&cpupool_lock); +} + +static int __init cpupool_init(void) +{ + cpupool_free_cpus = cpu_online_map; + cpupool_list = NULL; + return 0; +} +__initcall(cpupool_init); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ Index: xen-4.0.1-testing/xen/common/domain.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/domain.c +++ xen-4.0.1-testing/xen/common/domain.c @@ -209,7 +209,7 @@ static void __init parse_extra_guest_irq custom_param("extra_guest_irqs", parse_extra_guest_irqs); struct domain *domain_create( - domid_t domid, unsigned int domcr_flags, ssidref_t ssidref) + domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref) { struct domain *d, **pd; enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2, @@ -292,6 +292,9 @@ struct domain *domain_create( goto fail; init_status |= INIT_arch; + if ( cpupool_add_domain(d, poolid) != 0 ) + goto fail; + if ( sched_init_domain(d) != 0 ) goto fail; @@ -603,6 +606,8 @@ static void complete_domain_destroy(stru rangeset_domain_destroy(d); + cpupool_rm_domain(d); + sched_destroy_domain(d); /* Free page used by xen oprofile buffer. */ Index: xen-4.0.1-testing/xen/common/domctl.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/domctl.c +++ xen-4.0.1-testing/xen/common/domctl.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -141,10 +142,12 @@ void getdomaininfo(struct domain *d, str info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT); BUG_ON(SHARED_M2P(info->shared_info_frame)); + info->cpupool = d->cpupool ? 
d->cpupool->cpupool_id : CPUPOOLID_NONE; + memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t)); } -static unsigned int default_vcpu0_location(void) +static unsigned int default_vcpu0_location(cpumask_t *online) { struct domain *d; struct vcpu *v; @@ -174,7 +177,7 @@ static unsigned int default_vcpu0_locati if ( cpus_weight(per_cpu(cpu_sibling_map, 0)) > 1 ) cpu = next_cpu(cpu, per_cpu(cpu_sibling_map, 0)); cpu_exclude_map = per_cpu(cpu_sibling_map, 0); - for_each_online_cpu ( i ) + for_each_cpu_mask(i, *online) { if ( cpu_isset(i, cpu_exclude_map) ) continue; @@ -389,6 +392,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc domid_t dom; static domid_t rover = 0; unsigned int domcr_flags; + int pool = 0; ret = -EINVAL; if ( supervisor_mode_kernel || @@ -432,7 +436,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc domcr_flags |= DOMCRF_oos_off; ret = -ENOMEM; - d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref); + d = domain_create(dom, pool, domcr_flags, op->u.createdomain.ssidref); if ( d == NULL ) break; @@ -451,6 +455,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc { struct domain *d; unsigned int i, max = op->u.max_vcpus.max, cpu; + cpumask_t *online; ret = -ESRCH; if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL ) @@ -499,6 +504,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc goto maxvcpu_out; ret = -ENOMEM; + online = (d->cpupool == NULL) ? &cpu_online_map : &d->cpupool->cpu_valid; if ( max > d->max_vcpus ) { struct vcpu **vcpus; @@ -522,8 +528,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc continue; cpu = (i == 0) ? - default_vcpu0_location() : - cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map); + default_vcpu0_location(online) : + cycle_cpu(d->vcpu[i-1]->processor, *online); if ( alloc_vcpu(d, i, cpu) == NULL ) goto maxvcpu_out; Index: xen-4.0.1-testing/xen/common/kexec.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/kexec.c +++ xen-4.0.1-testing/xen/common/kexec.c @@ -235,7 +235,7 @@ void kexec_crash(void) BUG(); } -static long kexec_reboot(void *_image) +static long kexec_reboot(void *hdl, void *_image) { xen_kexec_image_t *image = _image; @@ -584,7 +584,7 @@ static int kexec_exec(XEN_GUEST_HANDLE(v { case KEXEC_TYPE_DEFAULT: image = &kexec_image[base + pos]; - ret = continue_hypercall_on_cpu(0, kexec_reboot, image); + ret = continue_hypercall_on_cpu(0, NULL, kexec_reboot, image); break; case KEXEC_TYPE_CRASH: kexec_crash(); /* Does not return */ Index: xen-4.0.1-testing/xen/common/sched_credit.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/sched_credit.c +++ xen-4.0.1-testing/xen/common/sched_credit.c @@ -70,11 +70,15 @@ /* * Useful macros */ +#define CSCHED_PRIV(_ops) \ + ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_PCPU(_c) \ ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) +#define CSCHED_CPUONLINE(_pool) \ + (((_pool) == NULL) ? 
&cpupool_free_cpus : &(_pool)->cpu_valid) /* @@ -160,10 +164,12 @@ struct csched_private { struct timer master_ticker; unsigned int master; cpumask_t idlers; + cpumask_t cpus; uint32_t weight; uint32_t credit; int credit_balance; uint32_t runq_sort; + int ticker_active; }; @@ -171,8 +177,10 @@ struct csched_private { * Global variables */ static struct csched_private csched_priv; +static struct csched_private *csched_priv0 = NULL; static void csched_tick(void *_cpu); +static void csched_acct(void *dummy); static inline int __vcpu_on_runq(struct csched_vcpu *svc) @@ -233,6 +241,7 @@ __runq_tickle(unsigned int cpu, struct c { struct csched_vcpu * const cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); + struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); cpumask_t mask; ASSERT(cur); @@ -259,14 +268,14 @@ __runq_tickle(unsigned int cpu, struct c */ if ( cur->pri > CSCHED_PRI_IDLE ) { - if ( cpus_empty(csched_priv.idlers) ) + if ( cpus_empty(prv->idlers) ) { CSCHED_STAT_CRANK(tickle_idlers_none); } else { CSCHED_STAT_CRANK(tickle_idlers_some); - cpus_or(mask, mask, csched_priv.idlers); + cpus_or(mask, mask, prv->idlers); cpus_and(mask, mask, new->vcpu->cpu_affinity); } } @@ -276,40 +285,80 @@ __runq_tickle(unsigned int cpu, struct c cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ); } -static int -csched_pcpu_init(int cpu) +static void +csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc = pcpu; + unsigned long flags; + + if ( spc == NULL ) + return; + + spin_lock_irqsave(&prv->lock, flags); + + prv->credit -= CSCHED_CREDITS_PER_ACCT; + prv->ncpus--; + cpu_clear(cpu, prv->idlers); + cpu_clear(cpu, prv->cpus); + if ( (prv->master == cpu) && (prv->ncpus > 0) ) + { + prv->master = first_cpu(prv->cpus); + migrate_timer(&prv->master_ticker, prv->master); + } + kill_timer(&spc->ticker); + if ( prv->ncpus == 0 ) + kill_timer(&prv->master_ticker); + + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(spc); +} + +static void * +csched_alloc_pdata(struct scheduler *ops, int cpu) { struct csched_pcpu *spc; + struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Allocate per-PCPU info */ spc = xmalloc(struct csched_pcpu); if ( spc == NULL ) - return -1; + return NULL; memset(spc, 0, sizeof(*spc)); - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); /* Initialize/update system-wide config */ - csched_priv.credit += CSCHED_CREDITS_PER_ACCT; - if ( csched_priv.ncpus <= cpu ) - csched_priv.ncpus = cpu + 1; - if ( csched_priv.master >= csched_priv.ncpus ) - csched_priv.master = cpu; + prv->credit += CSCHED_CREDITS_PER_ACCT; + prv->ncpus++; + cpu_set(cpu, prv->cpus); + if ( (prv->ncpus == 1) && (prv != csched_priv0) ) + { + prv->master = cpu; + init_timer( &prv->master_ticker, csched_acct, prv, cpu); + prv->ticker_active = 2; + } init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); + + if ( prv == csched_priv0 ) + prv->master = first_cpu(prv->cpus); + INIT_LIST_HEAD(&spc->runq); - spc->runq_sort_last = csched_priv.runq_sort; + spc->runq_sort_last = prv->runq_sort; spc->idle_bias = NR_CPUS - 1; - per_cpu(schedule_data, cpu).sched_priv = spc; + if ( per_cpu(schedule_data, cpu).sched_priv == NULL ) + per_cpu(schedule_data, cpu).sched_priv = spc; /* Start off idling... 
*/ BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr)); - cpu_set(cpu, csched_priv.idlers); + cpu_set(cpu, prv->idlers); - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); - return 0; + return spc; } #ifndef NDEBUG @@ -382,17 +431,19 @@ __csched_vcpu_is_migrateable(struct vcpu } static int -_csched_cpu_pick(struct vcpu *vc, bool_t commit) +_csched_cpu_pick(struct scheduler *ops, struct vcpu *vc, bool_t commit) { cpumask_t cpus; cpumask_t idlers; + cpumask_t *online; int cpu; /* * Pick from online CPUs in VCPU's affinity mask, giving a * preference to its current processor if it's in there. */ - cpus_and(cpus, cpu_online_map, vc->cpu_affinity); + online = CSCHED_CPUONLINE(vc->domain->cpupool); + cpus_and(cpus, *online, vc->cpu_affinity); cpu = cpu_isset(vc->processor, cpus) ? vc->processor : cycle_cpu(vc->processor, cpus); @@ -410,7 +461,7 @@ _csched_cpu_pick(struct vcpu *vc, bool_t * like run two VCPUs on co-hyperthreads while there are idle cores * or sockets. */ - cpus_and(idlers, cpu_online_map, csched_priv.idlers); + cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers); cpu_set(cpu, idlers); cpus_and(cpus, cpus, idlers); cpu_clear(cpu, cpus); @@ -456,18 +507,18 @@ _csched_cpu_pick(struct vcpu *vc, bool_t } static int -csched_cpu_pick(struct vcpu *vc) +csched_cpu_pick(struct scheduler *ops, struct vcpu *vc) { - return _csched_cpu_pick(vc, 1); + return _csched_cpu_pick(ops, vc, 1); } static inline void -__csched_vcpu_acct_start(struct csched_vcpu *svc) +__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; unsigned long flags; - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); if ( list_empty(&svc->active_vcpu_elem) ) { @@ -478,16 +529,17 @@ __csched_vcpu_acct_start(struct csched_v list_add(&svc->active_vcpu_elem, &sdom->active_vcpu); if ( list_empty(&sdom->active_sdom_elem) ) { - list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom); - csched_priv.weight += sdom->weight; + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + prv->weight += sdom->weight; } } - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); } static inline void -__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc) +__csched_vcpu_acct_stop_locked(struct csched_private *prv, + struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; @@ -500,16 +552,17 @@ __csched_vcpu_acct_stop_locked(struct cs list_del_init(&svc->active_vcpu_elem); if ( list_empty(&sdom->active_vcpu) ) { - BUG_ON( csched_priv.weight < sdom->weight ); + BUG_ON( prv->weight < sdom->weight ); list_del_init(&sdom->active_sdom_elem); - csched_priv.weight -= sdom->weight; + prv->weight -= sdom->weight; } } static void -csched_vcpu_acct(unsigned int cpu) +csched_vcpu_acct(struct csched_private *prv, unsigned int cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(current); + struct scheduler *ops = per_cpu(scheduler, cpu); ASSERT( current->processor == cpu ); ASSERT( svc->sdom != NULL ); @@ -538,9 +591,9 @@ csched_vcpu_acct(unsigned int cpu) */ if ( list_empty(&svc->active_vcpu_elem) ) { - __csched_vcpu_acct_start(svc); + __csched_vcpu_acct_start(prv, svc); } - else if ( _csched_cpu_pick(current, 0) != cpu ) + else if ( _csched_cpu_pick(ops, current, 0) != cpu ) { CSCHED_VCPU_STAT_CRANK(svc, migrate_r); CSCHED_STAT_CRANK(migrate_running); @@ -549,66 +602,75 @@ csched_vcpu_acct(unsigned int cpu) } } -static int 
-csched_vcpu_init(struct vcpu *vc) +static void * +csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd) { - struct domain * const dom = vc->domain; - struct csched_dom *sdom = CSCHED_DOM(dom); struct csched_vcpu *svc; - CSCHED_STAT_CRANK(vcpu_init); - /* Allocate per-VCPU info */ svc = xmalloc(struct csched_vcpu); if ( svc == NULL ) - return -1; + return NULL; memset(svc, 0, sizeof(*svc)); INIT_LIST_HEAD(&svc->runq_elem); INIT_LIST_HEAD(&svc->active_vcpu_elem); - svc->sdom = sdom; + svc->sdom = dd; svc->vcpu = vc; atomic_set(&svc->credit, 0); svc->flags = 0U; - svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; + svc->pri = is_idle_domain(vc->domain) ? + CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; CSCHED_VCPU_STATS_RESET(svc); - vc->sched_priv = svc; + CSCHED_STAT_CRANK(vcpu_init); + return svc; +} - /* Allocate per-PCPU info */ - if ( unlikely(!CSCHED_PCPU(vc->processor)) ) - { - if ( csched_pcpu_init(vc->processor) != 0 ) - return -1; - } +static void +csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc) +{ + struct csched_vcpu *svc = vc->sched_priv; - CSCHED_VCPU_CHECK(vc); - return 0; + if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) + __runq_insert(vc->processor, svc); } static void -csched_vcpu_destroy(struct vcpu *vc) +csched_free_vdata(struct scheduler *ops, void *priv) { - struct csched_vcpu * const svc = CSCHED_VCPU(vc); - struct csched_dom * const sdom = svc->sdom; + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_vcpu *svc = priv; unsigned long flags; - CSCHED_STAT_CRANK(vcpu_destroy); - - BUG_ON( sdom == NULL ); - BUG_ON( !list_empty(&svc->runq_elem) ); + if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&(prv->lock), flags); if ( !list_empty(&svc->active_vcpu_elem) ) - __csched_vcpu_acct_stop_locked(svc); + __csched_vcpu_acct_stop_locked(prv, svc); - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&(prv->lock), flags); xfree(svc); } static void -csched_vcpu_sleep(struct vcpu *vc) +csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc) +{ + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + struct csched_dom * const sdom = svc->sdom; + + CSCHED_STAT_CRANK(vcpu_destroy); + + BUG_ON( sdom == NULL ); + BUG_ON( !list_empty(&svc->runq_elem) ); + + csched_free_vdata(ops, svc); +} + +static void +csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); @@ -623,7 +685,7 @@ csched_vcpu_sleep(struct vcpu *vc) } static void -csched_vcpu_wake(struct vcpu *vc) +csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); const unsigned int cpu = vc->processor; @@ -679,10 +741,12 @@ csched_vcpu_wake(struct vcpu *vc) static int csched_dom_cntl( + struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); + struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) @@ -694,14 +758,14 @@ csched_dom_cntl( { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); if ( op->u.credit.weight != 0 ) { if ( !list_empty(&sdom->active_sdom_elem) ) { - csched_priv.weight -= sdom->weight; - csched_priv.weight += op->u.credit.weight; + prv->weight -= sdom->weight; + prv->weight += op->u.credit.weight; } sdom->weight = op->u.credit.weight; } 
@@ -709,25 +773,20 @@ csched_dom_cntl( if ( op->u.credit.cap != (uint16_t)~0U ) sdom->cap = op->u.credit.cap; - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); } return 0; } -static int -csched_dom_init(struct domain *dom) +static void * +csched_alloc_domdata(struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; - CSCHED_STAT_CRANK(dom_init); - - if ( is_idle_domain(dom) ) - return 0; - sdom = xmalloc(struct csched_dom); if ( sdom == NULL ) - return -ENOMEM; + return NULL; memset(sdom, 0, sizeof(*sdom)); /* Initialize credit and weight */ @@ -737,16 +796,40 @@ csched_dom_init(struct domain *dom) sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; sdom->cap = 0U; + + return (void *)sdom; +} + +static int +csched_dom_init(struct scheduler *ops, struct domain *dom) +{ + struct csched_dom *sdom; + + CSCHED_STAT_CRANK(dom_init); + + if ( is_idle_domain(dom) ) + return 0; + + sdom = csched_alloc_domdata(ops, dom); + if ( sdom == NULL ) + return -ENOMEM; + dom->sched_priv = sdom; return 0; } static void -csched_dom_destroy(struct domain *dom) +csched_free_domdata(struct scheduler *ops, void *data) +{ + xfree(data); +} + +static void +csched_dom_destroy(struct scheduler *ops, struct domain *dom) { CSCHED_STAT_CRANK(dom_destroy); - xfree(CSCHED_DOM(dom)); + csched_free_domdata(ops, CSCHED_DOM(dom)); } /* @@ -757,7 +840,7 @@ csched_dom_destroy(struct domain *dom) * remember the last UNDER to make the move up operation O(1). */ static void -csched_runq_sort(unsigned int cpu) +csched_runq_sort(struct csched_private *prv, unsigned int cpu) { struct csched_pcpu * const spc = CSCHED_PCPU(cpu); struct list_head *runq, *elem, *next, *last_under; @@ -765,7 +848,7 @@ csched_runq_sort(unsigned int cpu) unsigned long flags; int sort_epoch; - sort_epoch = csched_priv.runq_sort; + sort_epoch = prv->runq_sort; if ( sort_epoch == spc->runq_sort_last ) return; @@ -802,6 +885,7 @@ csched_runq_sort(unsigned int cpu) static void csched_acct(void* dummy) { + struct csched_private *prv = dummy; unsigned long flags; struct list_head *iter_vcpu, *next_vcpu; struct list_head *iter_sdom, *next_sdom; @@ -818,22 +902,22 @@ csched_acct(void* dummy) int credit; - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); - weight_total = csched_priv.weight; - credit_total = csched_priv.credit; + weight_total = prv->weight; + credit_total = prv->credit; /* Converge balance towards 0 when it drops negative */ - if ( csched_priv.credit_balance < 0 ) + if ( prv->credit_balance < 0 ) { - credit_total -= csched_priv.credit_balance; + credit_total -= prv->credit_balance; CSCHED_STAT_CRANK(acct_balance); } if ( unlikely(weight_total == 0) ) { - csched_priv.credit_balance = 0; - spin_unlock_irqrestore(&csched_priv.lock, flags); + prv->credit_balance = 0; + spin_unlock_irqrestore(&prv->lock, flags); CSCHED_STAT_CRANK(acct_no_work); goto out; } @@ -845,7 +929,7 @@ csched_acct(void* dummy) credit_xtra = 0; credit_cap = 0U; - list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom ) + list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) { sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); @@ -865,9 +949,9 @@ csched_acct(void* dummy) * only when the system-wide credit balance is negative. 
*/ credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT; - if ( csched_priv.credit_balance < 0 ) + if ( prv->credit_balance < 0 ) { - credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) + + credit_peak += ( ( -prv->credit_balance * sdom->weight) + (weight_total - 1) ) / weight_total; } @@ -909,7 +993,7 @@ csched_acct(void* dummy) */ CSCHED_STAT_CRANK(acct_reorder); list_del(&sdom->active_sdom_elem); - list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom); + list_add(&sdom->active_sdom_elem, &prv->active_sdom); } credit_fair = credit_peak; @@ -975,7 +1059,7 @@ csched_acct(void* dummy) /* Upper bound on credits means VCPU stops earning */ if ( credit > CSCHED_CREDITS_PER_TSLICE ) { - __csched_vcpu_acct_stop_locked(svc); + __csched_vcpu_acct_stop_locked(prv, svc); credit = 0; atomic_set(&svc->credit, credit); } @@ -987,15 +1071,15 @@ csched_acct(void* dummy) } } - csched_priv.credit_balance = credit_balance; + prv->credit_balance = credit_balance; - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); /* Inform each CPU that its runq needs to be sorted */ - csched_priv.runq_sort++; + prv->runq_sort++; out: - set_timer( &csched_priv.master_ticker, NOW() + + set_timer( &prv->master_ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT ); } @@ -1004,6 +1088,7 @@ csched_tick(void *_cpu) { unsigned int cpu = (unsigned long)_cpu; struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); spc->tick++; @@ -1011,7 +1096,7 @@ csched_tick(void *_cpu) * Accounting for running VCPU */ if ( !is_idle_vcpu(current) ) - csched_vcpu_acct(cpu); + csched_vcpu_acct(prv, cpu); /* * Check if runq needs to be sorted @@ -1020,7 +1105,7 @@ csched_tick(void *_cpu) * modified priorities. This is a special O(n) sort and runs at most * once per accounting period (currently 30 milliseconds). */ - csched_runq_sort(cpu); + csched_runq_sort(prv, cpu); set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK)); } @@ -1073,16 +1158,19 @@ csched_runq_steal(int peer_cpu, int cpu, } static struct csched_vcpu * -csched_load_balance(int cpu, struct csched_vcpu *snext) +csched_load_balance(struct csched_private *prv, int cpu, + struct csched_vcpu *snext) { struct csched_vcpu *speer; cpumask_t workers; + cpumask_t *online; int peer_cpu; BUG_ON( cpu != snext->vcpu->processor ); + online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu)); /* If this CPU is going offline we shouldn't steal work. */ - if ( unlikely(!cpu_online(cpu)) ) + if ( unlikely(!cpu_isset(cpu, *online)) ) goto out; if ( snext->pri == CSCHED_PRI_IDLE ) @@ -1096,7 +1184,7 @@ csched_load_balance(int cpu, struct csch * Peek at non-idling CPUs in the system, starting with our * immediate neighbour. */ - cpus_andnot(workers, cpu_online_map, csched_priv.idlers); + cpus_andnot(workers, *online, prv->idlers); cpu_clear(cpu, workers); peer_cpu = cpu; @@ -1138,11 +1226,12 @@ csched_load_balance(int cpu, struct csch * fast for the common case. 
*/ static struct task_slice -csched_schedule(s_time_t now) +csched_schedule(struct scheduler *ops, s_time_t now) { const int cpu = smp_processor_id(); struct list_head * const runq = RUNQ(cpu); struct csched_vcpu * const scurr = CSCHED_VCPU(current); + struct csched_private *prv = CSCHED_PRIV(ops); struct csched_vcpu *snext; struct task_slice ret; @@ -1177,7 +1266,7 @@ csched_schedule(s_time_t now) if ( snext->pri > CSCHED_PRI_TS_OVER ) __runq_remove(snext); else - snext = csched_load_balance(cpu, snext); + snext = csched_load_balance(prv, cpu, snext); /* * Update idlers mask if necessary. When we're idling, other CPUs @@ -1185,12 +1274,12 @@ csched_schedule(s_time_t now) */ if ( snext->pri == CSCHED_PRI_IDLE ) { - if ( !cpu_isset(cpu, csched_priv.idlers) ) - cpu_set(cpu, csched_priv.idlers); + if ( !cpu_isset(cpu, prv->idlers) ) + cpu_set(cpu, prv->idlers); } - else if ( cpu_isset(cpu, csched_priv.idlers) ) + else if ( cpu_isset(cpu, prv->idlers) ) { - cpu_clear(cpu, csched_priv.idlers); + cpu_clear(cpu, prv->idlers); } if ( !is_idle_vcpu(snext->vcpu) ) @@ -1237,7 +1326,7 @@ csched_dump_vcpu(struct csched_vcpu *svc } static void -csched_dump_pcpu(int cpu) +csched_dump_pcpu(struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_pcpu *spc; @@ -1275,9 +1364,10 @@ csched_dump_pcpu(int cpu) } static void -csched_dump(void) +csched_dump(struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; + struct csched_private *prv = CSCHED_PRIV(ops); int loop; #define idlers_buf keyhandler_scratch @@ -1294,12 +1384,12 @@ csched_dump(void) "\tticks per tslice = %d\n" "\tticks per acct = %d\n" "\tmigration delay = %uus\n", - csched_priv.ncpus, - csched_priv.master, - csched_priv.credit, - csched_priv.credit_balance, - csched_priv.weight, - csched_priv.runq_sort, + prv->ncpus, + prv->master, + prv->credit, + prv->credit_balance, + prv->weight, + prv->runq_sort, CSCHED_DEFAULT_WEIGHT, CSCHED_MSECS_PER_TICK, CSCHED_CREDITS_PER_MSEC, @@ -1307,12 +1397,12 @@ csched_dump(void) CSCHED_TICKS_PER_ACCT, vcpu_migration_delay); - cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers); + cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers); printk("idlers: %s\n", idlers_buf); printk("active vcpus:\n"); loop = 0; - list_for_each( iter_sdom, &csched_priv.active_sdom ) + list_for_each( iter_sdom, &prv->active_sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); @@ -1329,18 +1419,30 @@ csched_dump(void) #undef idlers_buf } -static void -csched_init(void) +static int +csched_init(struct scheduler *ops) { - spin_lock_init(&csched_priv.lock); - INIT_LIST_HEAD(&csched_priv.active_sdom); - csched_priv.ncpus = 0; - csched_priv.master = UINT_MAX; - cpus_clear(csched_priv.idlers); - csched_priv.weight = 0U; - csched_priv.credit = 0U; - csched_priv.credit_balance = 0; - csched_priv.runq_sort = 0U; + struct csched_private *prv; + + prv = xmalloc(struct csched_private); + if ( prv == NULL ) + return 1; + memset(prv, 0, sizeof(*prv)); + if (csched_priv0 == NULL) + csched_priv0 = prv; + ops->sched_data = prv; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->active_sdom); + prv->ncpus = 0; + prv->master = UINT_MAX; + cpus_clear(prv->idlers); + prv->weight = 0U; + prv->credit = 0U; + prv->credit_balance = 0; + prv->runq_sort = 0U; + prv->ticker_active = (csched_priv0 == prv) ? 0 : 1; + + return 0; } /* Tickers cannot be kicked until SMP subsystem is alive. 
*/ @@ -1350,54 +1452,81 @@ static __init int csched_start_tickers(v unsigned int cpu; /* Is the credit scheduler initialised? */ - if ( csched_priv.ncpus == 0 ) + if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) ) return 0; + csched_priv0->ticker_active = 1; + for_each_online_cpu ( cpu ) { spc = CSCHED_PCPU(cpu); set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK)); } - init_timer( &csched_priv.master_ticker, csched_acct, NULL, - csched_priv.master); + init_timer( &csched_priv0->master_ticker, csched_acct, csched_priv0, + csched_priv0->master); - set_timer( &csched_priv.master_ticker, NOW() + + set_timer( &csched_priv0->master_ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT ); return 0; } __initcall(csched_start_tickers); -static void csched_tick_suspend(void) +static void +csched_deinit(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( prv != NULL ) + xfree(prv); +} + +static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu) { struct csched_pcpu *spc; - spc = CSCHED_PCPU(smp_processor_id()); + spc = CSCHED_PCPU(cpu); stop_timer(&spc->ticker); } -static void csched_tick_resume(void) +static void csched_tick_resume(struct scheduler *ops, unsigned int cpu) { struct csched_pcpu *spc; uint64_t now = NOW(); + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( !prv->ticker_active ) + return; - spc = CSCHED_PCPU(smp_processor_id()); + + spc = CSCHED_PCPU(cpu); set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK) - now % MILLISECS(CSCHED_MSECS_PER_TICK) ); + + if ( (prv->ticker_active == 2) && (prv->master == cpu) ) + { + set_timer( &prv->master_ticker, now + + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT - + now % MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT); + prv->ticker_active = 1; + } } -const struct scheduler sched_credit_def = { +struct scheduler sched_credit_def = { .name = "SMP Credit Scheduler", .opt_name = "credit", .sched_id = XEN_SCHEDULER_CREDIT, + .sched_data = &csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, - .init_vcpu = csched_vcpu_init, + .insert_vcpu = csched_vcpu_insert, .destroy_vcpu = csched_vcpu_destroy, .sleep = csched_vcpu_sleep, @@ -1411,6 +1540,13 @@ const struct scheduler sched_credit_def .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .init = csched_init, + .deinit = csched_deinit, + .alloc_vdata = csched_alloc_vdata, + .free_vdata = csched_free_vdata, + .alloc_pdata = csched_alloc_pdata, + .free_pdata = csched_free_pdata, + .alloc_domdata = csched_alloc_domdata, + .free_domdata = csched_free_domdata, .tick_suspend = csched_tick_suspend, .tick_resume = csched_tick_resume, Index: xen-4.0.1-testing/xen/common/sched_sedf.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/sched_sedf.c +++ xen-4.0.1-testing/xen/common/sched_sedf.c @@ -21,6 +21,9 @@ printk(_a ); \ } while ( 0 ) +#define SEDF_CPUONLINE(_pool) \ + (((_pool) == NULL) ? 
&cpupool_free_cpus : &(_pool)->cpu_valid) + #ifndef NDEBUG #define SEDF_STATS #define CHECK(_p) \ @@ -132,7 +135,7 @@ struct sedf_cpu_info { #define sedf_runnable(edom) (!(EDOM_INFO(edom)->status & SEDF_ASLEEP)) -static void sedf_dump_cpu_state(int i); +static void sedf_dump_cpu_state(struct scheduler *ops, int i); static inline int extraq_on(struct vcpu *d, int i) { @@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor } -static int sedf_init_vcpu(struct vcpu *v) +static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v, void *dd) { struct sedf_vcpu_info *inf; - if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL ) - return -1; - memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info)); + inf = xmalloc(struct sedf_vcpu_info); + if ( inf == NULL ) + return NULL; - inf = EDOM_INFO(v); + memset(inf, 0, sizeof(struct sedf_vcpu_info)); inf->vcpu = v; - - /* Allocate per-CPU context if this is the first domain to be added. */ - if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) ) - { - per_cpu(schedule_data, v->processor).sched_priv = - xmalloc(struct sedf_cpu_info); - BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL); - memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor))); - INIT_LIST_HEAD(WAITQ(v->processor)); - INIT_LIST_HEAD(RUNQ(v->processor)); - INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q)); - INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q)); - } - + /* Every VCPU gets an equal share of extratime by default. */ inf->deadl_abs = 0; inf->latency = 0; @@ -383,39 +373,88 @@ static int sedf_init_vcpu(struct vcpu *v } else { - EDOM_INFO(v)->deadl_abs = 0; - EDOM_INFO(v)->status &= ~SEDF_ASLEEP; + inf->deadl_abs = 0; + inf->status &= ~SEDF_ASLEEP; } - return 0; + return inf; +} + +static void * +sedf_alloc_pdata(struct scheduler *ops, int cpu) +{ + struct sedf_cpu_info *spc; + + spc = xmalloc(struct sedf_cpu_info); + BUG_ON(spc == NULL); + memset(spc, 0, sizeof(*spc)); + INIT_LIST_HEAD(&spc->waitq); + INIT_LIST_HEAD(&spc->runnableq); + INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]); + INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]); + + return (void *)spc; +} + +static void +sedf_free_pdata(struct scheduler *ops, void *spc, int cpu) +{ + if ( spc == NULL ) + return; + + xfree(spc); +} + +static void sedf_free_vdata(struct scheduler *ops, void *priv) +{ + xfree(priv); } -static void sedf_destroy_vcpu(struct vcpu *v) +static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v) { - xfree(v->sched_priv); + sedf_free_vdata(ops, v->sched_priv); } -static int sedf_init_domain(struct domain *d) +static void * +sedf_alloc_domdata(struct scheduler *ops, struct domain *d) { - d->sched_priv = xmalloc(struct sedf_dom_info); + void *mem; + + mem = xmalloc(struct sedf_dom_info); + if ( mem == NULL ) + return NULL; + + memset(mem, 0, sizeof(struct sedf_dom_info)); + + return mem; +} + +static int sedf_init_domain(struct scheduler *ops, struct domain *d) +{ + d->sched_priv = sedf_alloc_domdata(ops, d); if ( d->sched_priv == NULL ) return -ENOMEM; - memset(d->sched_priv, 0, sizeof(struct sedf_dom_info)); - return 0; } -static void sedf_destroy_domain(struct domain *d) +static void sedf_free_domdata(struct scheduler *ops, void *data) +{ + xfree(data); +} + +static void sedf_destroy_domain(struct scheduler *ops, struct domain *d) { - xfree(d->sched_priv); + sedf_free_domdata(ops, d->sched_priv); } -static int sedf_pick_cpu(struct vcpu *v) +static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v) { cpumask_t online_affinity; + 
cpumask_t *online; - cpus_and(online_affinity, v->cpu_affinity, cpu_online_map); + online = SEDF_CPUONLINE(v->domain->cpupool); + cpus_and(online_affinity, v->cpu_affinity, *online); return first_cpu(online_affinity); } @@ -751,7 +790,7 @@ static struct task_slice sedf_do_extra_s -timeslice for the current period used up -domain on waitqueue has started it's period -and various others ;) in general: determine which domain to run next*/ -static struct task_slice sedf_do_schedule(s_time_t now) +static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now) { int cpu = smp_processor_id(); struct list_head *runq = RUNQ(cpu); @@ -786,6 +825,13 @@ static struct task_slice sedf_do_schedul } check_waitq: update_queues(now, runq, waitq); + + if ( unlikely(!cpu_isset(cpu, *SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) ) + { + ret.task = IDLETASK(cpu); + ret.time = SECONDS(1); + goto sched_done; + } /*now simply pick the first domain from the runqueue, which has the earliest deadline, because the list is sorted*/ @@ -848,7 +894,7 @@ static struct task_slice sedf_do_schedul } -static void sedf_sleep(struct vcpu *d) +static void sedf_sleep(struct scheduler *ops, struct vcpu *d) { PRINT(2,"sedf_sleep was called, domain-id %i.%i\n", d->domain->domain_id, d->vcpu_id); @@ -1067,7 +1113,7 @@ static inline int should_switch(struct v return 1; } -static void sedf_wake(struct vcpu *d) +static void sedf_wake(struct scheduler *ops, struct vcpu *d) { s_time_t now = NOW(); struct sedf_vcpu_info* inf = EDOM_INFO(d); @@ -1220,8 +1266,8 @@ static void sedf_dump_domain(struct vcpu } -/* dumps all domains on hte specified cpu */ -static void sedf_dump_cpu_state(int i) +/* dumps all domains on the specified cpu */ +static void sedf_dump_cpu_state(struct scheduler *ops, int i) { struct list_head *list, *queue, *tmp; struct sedf_vcpu_info *d_inf; @@ -1294,7 +1340,7 @@ static void sedf_dump_cpu_state(int i) /* Adjusts periods and slices of the domains accordingly to their weights. */ -static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd) +static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd) { struct vcpu *p; struct domain *d; @@ -1315,6 +1361,8 @@ static int sedf_adjust_weights(struct xe rcu_read_lock(&domlist_read_lock); for_each_domain( d ) { + if ( c != d->cpupool ) + continue; for_each_vcpu( d, p ) { if ( EDOM_INFO(p)->weight ) @@ -1366,7 +1414,7 @@ static int sedf_adjust_weights(struct xe /* set or fetch domain scheduling parameters */ -static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op) +static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op) { struct vcpu *v; int rc; @@ -1376,9 +1424,6 @@ static int sedf_adjust(struct domain *p, p->domain_id, op->u.sedf.period, op->u.sedf.slice, op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no"); - if ( !p->vcpu ) - return -EINVAL; - if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo ) { /* Check for sane parameters. 
*/ @@ -1428,7 +1473,7 @@ static int sedf_adjust(struct domain *p, } } - rc = sedf_adjust_weights(op); + rc = sedf_adjust_weights(p->cpupool, op); if ( rc ) return rc; @@ -1456,7 +1501,7 @@ static int sedf_adjust(struct domain *p, return 0; } -const struct scheduler sched_sedf_def = { +struct scheduler sched_sedf_def = { .name = "Simple EDF Scheduler", .opt_name = "sedf", .sched_id = XEN_SCHEDULER_SEDF, @@ -1464,9 +1509,15 @@ const struct scheduler sched_sedf_def = .init_domain = sedf_init_domain, .destroy_domain = sedf_destroy_domain, - .init_vcpu = sedf_init_vcpu, .destroy_vcpu = sedf_destroy_vcpu, + .alloc_vdata = sedf_alloc_vdata, + .free_vdata = sedf_free_vdata, + .alloc_pdata = sedf_alloc_pdata, + .free_pdata = sedf_free_pdata, + .alloc_domdata = sedf_alloc_domdata, + .free_domdata = sedf_free_domdata, + .do_schedule = sedf_do_schedule, .pick_cpu = sedf_pick_cpu, .dump_cpu_state = sedf_dump_cpu_state, Index: xen-4.0.1-testing/xen/common/schedule.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/schedule.c +++ xen-4.0.1-testing/xen/common/schedule.c @@ -53,10 +53,11 @@ static void poll_timer_fn(void *data); /* This is global for now so that private implementations can reach it */ DEFINE_PER_CPU(struct schedule_data, schedule_data); +DEFINE_PER_CPU(struct scheduler *, scheduler); -extern const struct scheduler sched_sedf_def; -extern const struct scheduler sched_credit_def; -static const struct scheduler *__initdata schedulers[] = { +extern struct scheduler sched_sedf_def; +extern struct scheduler sched_credit_def; +static struct scheduler *schedulers[] = { &sched_sedf_def, &sched_credit_def, NULL @@ -64,9 +65,15 @@ static const struct scheduler *__initdat static struct scheduler __read_mostly ops; -#define SCHED_OP(fn, ...) \ - (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) \ - : (typeof(ops.fn(__VA_ARGS__)))0 ) +#define SCHED_OP(opsptr, fn, ...) \ + (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \ + : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 ) + +#define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched)) +#define VCPU2OP(_v) (DOM2OP((_v)->domain)) +#define VCPU2ONLINE(_v) \ + (((_v)->domain->cpupool == NULL) ? 
&cpu_online_map \ + : &(_v)->domain->cpupool->cpu_valid) static inline void trace_runstate_change(struct vcpu *v, int new_state) { @@ -207,7 +214,86 @@ int sched_init_vcpu(struct vcpu *v, unsi TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id); - return SCHED_OP(init_vcpu, v); + if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) ) + { + per_cpu(schedule_data, v->processor).sched_priv = + SCHED_OP(DOM2OP(d), alloc_pdata, processor); + if ( per_cpu(schedule_data, v->processor).sched_priv == NULL ) + return 1; + } + + v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv); + if ( v->sched_priv == NULL ) + return 1; + + if ( is_idle_domain(d) ) + per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv; + + return 0; +} + +int sched_move_domain(struct domain *d, struct cpupool *c) +{ + struct vcpu *v; + unsigned int new_p; + void **vcpu_priv; + void *domdata; + + domdata = SCHED_OP(&(c->sched), alloc_domdata, d); + if ( domdata == NULL ) + return -ENOMEM; + + vcpu_priv = xmalloc_array(void *, d->max_vcpus); + if ( vcpu_priv == NULL ) + { + SCHED_OP(&(c->sched), free_domdata, domdata); + return -ENOMEM; + } + + memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *)); + for_each_vcpu ( d, v ) + { + vcpu_priv[v->vcpu_id] = SCHED_OP(&(c->sched), alloc_vdata, v, domdata); + if ( vcpu_priv[v->vcpu_id] == NULL ) + { + for_each_vcpu ( d, v ) + { + if ( vcpu_priv[v->vcpu_id] != NULL ) + xfree(vcpu_priv[v->vcpu_id]); + } + xfree(vcpu_priv); + SCHED_OP(&(c->sched), free_domdata, domdata); + return -ENOMEM; + } + } + + domain_pause(d); + + new_p = first_cpu(c->cpu_valid); + for_each_vcpu ( d, v ) + { + migrate_timer(&v->periodic_timer, new_p); + migrate_timer(&v->singleshot_timer, new_p); + migrate_timer(&v->poll_timer, new_p); + + SCHED_OP(VCPU2OP(v), destroy_vcpu, v); + + cpus_setall(v->cpu_affinity); + v->processor = new_p; + v->sched_priv = vcpu_priv[v->vcpu_id]; + + new_p = cycle_cpu(new_p, c->cpu_valid); + } + + d->cpupool = c; + SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv); + d->sched_priv = domdata; + + domain_unpause(d); + + xfree(vcpu_priv); + + return 0; } void sched_destroy_vcpu(struct vcpu *v) @@ -217,17 +303,17 @@ void sched_destroy_vcpu(struct vcpu *v) kill_timer(&v->poll_timer); if ( test_and_clear_bool(v->is_urgent) ) atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count); - SCHED_OP(destroy_vcpu, v); + SCHED_OP(VCPU2OP(v), destroy_vcpu, v); } int sched_init_domain(struct domain *d) { - return SCHED_OP(init_domain, d); + return SCHED_OP(DOM2OP(d), init_domain, d); } void sched_destroy_domain(struct domain *d) { - SCHED_OP(destroy_domain, d); + SCHED_OP(DOM2OP(d), destroy_domain, d); } void vcpu_sleep_nosync(struct vcpu *v) @@ -241,7 +327,7 @@ void vcpu_sleep_nosync(struct vcpu *v) if ( v->runstate.state == RUNSTATE_runnable ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - SCHED_OP(sleep, v); + SCHED_OP(VCPU2OP(v), sleep, v); } vcpu_schedule_unlock_irqrestore(v, flags); @@ -269,7 +355,7 @@ void vcpu_wake(struct vcpu *v) { if ( v->runstate.state >= RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); - SCHED_OP(wake, v); + SCHED_OP(VCPU2OP(v), wake, v); } else if ( !test_bit(_VPF_blocked, &v->pause_flags) ) { @@ -324,7 +410,7 @@ static void vcpu_migrate(struct vcpu *v) /* Select new CPU. 
*/ old_cpu = v->processor; - new_cpu = SCHED_OP(pick_cpu, v); + new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v); /* * Transfer urgency status to new CPU before switching CPUs, as once @@ -367,22 +453,32 @@ void vcpu_force_reschedule(struct vcpu * } /* - * This function is used by cpu_hotplug code from stop_machine context. - * Hence we can avoid needing to take the + * This function is used by cpu_hotplug code from stop_machine context + * and from cpupools to switch schedulers on a cpu. */ -void cpu_disable_scheduler(void) +int cpu_disable_scheduler(unsigned int cpu, int lock) { struct domain *d; struct vcpu *v; - unsigned int cpu = smp_processor_id(); + struct cpupool *c; + int ret = 0; + + c = per_cpu(cpupool, cpu); + if ( c == NULL ) + return ret; for_each_domain ( d ) { + if ( d->cpupool != c ) + continue; + for_each_vcpu ( d, v ) { if ( is_idle_vcpu(v) ) continue; + if ( lock != 0 ) + vcpu_schedule_lock_irq(v); if ( (cpus_weight(v->cpu_affinity) == 1) && cpu_isset(cpu, v->cpu_affinity) ) { @@ -396,39 +492,51 @@ void cpu_disable_scheduler(void) * be chosen when the timer is next re-set. */ if ( v->singleshot_timer.cpu == cpu ) - migrate_timer(&v->singleshot_timer, 0); + { + int cpu_mig; + + cpu_mig = first_cpu(c->cpu_valid); + if (cpu_mig == cpu) + cpu_mig = next_cpu(cpu_mig, c->cpu_valid); + migrate_timer(&v->singleshot_timer, cpu_mig); + } if ( v->processor == cpu ) { set_bit(_VPF_migrating, &v->pause_flags); + if ( lock != 0 ) + vcpu_schedule_unlock_irq(v); vcpu_sleep_nosync(v); vcpu_migrate(v); } + else if ( lock != 0 ) + vcpu_schedule_unlock_irq(v); + /* + * A vcpu active in the hypervisor will not be migratable. + * The caller should try again after releasing and reaquiring + * all locks. + */ + if ( v->processor == cpu ) + ret = -EAGAIN; } } + return ret; } -static int __vcpu_set_affinity( - struct vcpu *v, cpumask_t *affinity, - bool_t old_lock_status, bool_t new_lock_status) +int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) { cpumask_t online_affinity, old_affinity; + cpumask_t *online; - cpus_and(online_affinity, *affinity, cpu_online_map); + if ( v->domain->is_pinned ) + return -EINVAL; + online = VCPU2ONLINE(v); + cpus_and(online_affinity, *affinity, *online); if ( cpus_empty(online_affinity) ) return -EINVAL; vcpu_schedule_lock_irq(v); - if ( v->affinity_locked != old_lock_status ) - { - BUG_ON(!v->affinity_locked); - vcpu_schedule_unlock_irq(v); - return -EBUSY; - } - - v->affinity_locked = new_lock_status; - old_affinity = v->cpu_affinity; v->cpu_affinity = *affinity; *affinity = old_affinity; @@ -446,36 +554,6 @@ static int __vcpu_set_affinity( return 0; } -int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) -{ - if ( v->domain->is_pinned ) - return -EINVAL; - return __vcpu_set_affinity(v, affinity, 0, 0); -} - -int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity) -{ - return __vcpu_set_affinity(v, affinity, 0, 1); -} - -int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity) -{ - return __vcpu_set_affinity(v, affinity, 1, 1); -} - -void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity) -{ - cpumask_t online_affinity; - - /* Do not fail if no CPU in old affinity mask is online. */ - cpus_and(online_affinity, *affinity, cpu_online_map); - if ( cpus_empty(online_affinity) ) - *affinity = cpu_online_map; - - if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 ) - BUG(); -} - /* Block the currently-executing domain until a pertinent event occurs. 
*/ static long do_block(void) { @@ -783,7 +861,7 @@ long sched_adjust(struct domain *d, stru struct vcpu *v; long ret; - if ( (op->sched_id != ops.sched_id) || + if ( (op->sched_id != DOM2OP(d)->sched_id) || ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) && (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) ) return -EINVAL; @@ -810,7 +888,7 @@ long sched_adjust(struct domain *d, stru if ( d == current->domain ) vcpu_schedule_lock_irq(current); - if ( (ret = SCHED_OP(adjust, d, op)) == 0 ) + if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 ) TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); if ( d == current->domain ) @@ -857,6 +935,7 @@ static void schedule(void) { struct vcpu *prev = current, *next = NULL; s_time_t now = NOW(); + struct scheduler *sched = this_cpu(scheduler); struct schedule_data *sd; struct task_slice next_slice; @@ -872,7 +951,7 @@ static void schedule(void) stop_timer(&sd->s_timer); /* get policy-specific decision on scheduling... */ - next_slice = ops.do_schedule(now); + next_slice = sched->do_schedule(sched, now); next = next_slice.task; @@ -978,6 +1057,19 @@ static void poll_timer_fn(void *data) vcpu_unblock(v); } +/* Get scheduler by id */ +struct scheduler *scheduler_get_by_id(unsigned int id) +{ + int i; + + for ( i = 0; schedulers[i] != NULL; i++ ) + { + if ( schedulers[i]->sched_id == id ) + return schedulers[i]; + } + return NULL; +} + /* Initialise the data structures. */ void __init scheduler_init(void) { @@ -985,12 +1077,6 @@ void __init scheduler_init(void) open_softirq(SCHEDULE_SOFTIRQ, schedule); - for_each_possible_cpu ( i ) - { - spin_lock_init(&per_cpu(schedule_data, i).schedule_lock); - init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i); - } - for ( i = 0; schedulers[i] != NULL; i++ ) { ops = *schedulers[i]; @@ -1004,43 +1090,123 @@ void __init scheduler_init(void) ops = *schedulers[0]; } + for_each_possible_cpu ( i ) + { + per_cpu(scheduler, i) = &ops; + spin_lock_init(&per_cpu(schedule_data, i).schedule_lock); + init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i); + } + printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - SCHED_OP(init); + if ( SCHED_OP(&ops, init) ) + panic("scheduler returned error on init\n"); } -void dump_runq(unsigned char key) +/* switch scheduler on cpu */ +void schedule_cpu_switch(unsigned int cpu, struct cpupool *c) { - s_time_t now = NOW(); - int i; unsigned long flags; + struct vcpu *v; + void *vpriv = NULL; + void *ppriv; + void *ppriv_old; + struct scheduler *old_ops; + struct scheduler *new_ops; + + old_ops = per_cpu(scheduler, cpu); + new_ops = (c == NULL) ? 
&ops : &(c->sched); + v = per_cpu(schedule_data, cpu).idle; + ppriv = SCHED_OP(new_ops, alloc_pdata, cpu); + if ( c != NULL ) + vpriv = SCHED_OP(new_ops, alloc_vdata, v, v->domain->sched_priv); + + spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags); + + if ( c == NULL ) + { + vpriv = v->sched_priv; + v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv; + } + else + { + v->sched_priv = vpriv; + vpriv = NULL; + } + SCHED_OP(old_ops, tick_suspend, cpu); + per_cpu(scheduler, cpu) = new_ops; + ppriv_old = per_cpu(schedule_data, cpu).sched_priv; + per_cpu(schedule_data, cpu).sched_priv = ppriv; + SCHED_OP(new_ops, tick_resume, cpu); + SCHED_OP(new_ops, insert_vcpu, v); + + spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags); + + if ( vpriv != NULL ) + SCHED_OP(old_ops, free_vdata, vpriv); + SCHED_OP(old_ops, free_pdata, ppriv_old, cpu); +} + +/* init scheduler global data */ +int schedule_init_global(char *name, struct scheduler *sched) +{ + int i; + struct scheduler *data; + + data = &ops; + for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ ) + { + if ( strcmp(schedulers[i]->opt_name, name) == 0 ) + { + data = schedulers[i]; + break; + } + } + memcpy(sched, data, sizeof(*sched)); + return SCHED_OP(sched, init); +} - local_irq_save(flags); +/* deinitialize scheduler global data */ +void schedule_deinit_global(struct scheduler *sched) +{ + SCHED_OP(sched, deinit); +} - printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name); - SCHED_OP(dump_settings); - printk("sched_smt_power_savings: %s\n", - sched_smt_power_savings? "enabled":"disabled"); - printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); +void schedule_dump(struct cpupool *c) +{ + int i; + struct scheduler *sched; + cpumask_t *cpus; + + sched = (c == NULL) ? &ops : &(c->sched); + cpus = (c == NULL) ? 
&cpupool_free_cpus : &c->cpu_valid; + printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); + SCHED_OP(sched, dump_settings); - for_each_online_cpu ( i ) + for_each_cpu_mask (i, *cpus) { spin_lock(&per_cpu(schedule_data, i).schedule_lock); printk("CPU[%02d] ", i); - SCHED_OP(dump_cpu_state, i); + SCHED_OP(sched, dump_cpu_state, i); spin_unlock(&per_cpu(schedule_data, i).schedule_lock); } - - local_irq_restore(flags); } void sched_tick_suspend(void) { - SCHED_OP(tick_suspend); + struct scheduler *sched; + unsigned int cpu = smp_processor_id(); + + sched = per_cpu(scheduler, cpu); + SCHED_OP(sched, tick_suspend, cpu); } void sched_tick_resume(void) { - SCHED_OP(tick_resume); + struct scheduler *sched; + unsigned int cpu = smp_processor_id(); + + sched = per_cpu(scheduler, cpu); + SCHED_OP(sched, tick_resume, cpu); } #ifdef CONFIG_COMPAT Index: xen-4.0.1-testing/xen/common/softirq.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/softirq.c +++ xen-4.0.1-testing/xen/common/softirq.c @@ -88,9 +88,11 @@ void raise_softirq(unsigned int nr) } static LIST_HEAD(tasklet_list); +static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu); static DEFINE_SPINLOCK(tasklet_lock); -void tasklet_schedule(struct tasklet *t) +static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist, + int cpu) { unsigned long flags; @@ -101,28 +103,44 @@ void tasklet_schedule(struct tasklet *t) if ( !t->is_scheduled && !t->is_running ) { BUG_ON(!list_empty(&t->list)); - list_add_tail(&t->list, &tasklet_list); + list_add_tail(&t->list, tlist); } t->is_scheduled = 1; - raise_softirq(TASKLET_SOFTIRQ); + if ( cpu == smp_processor_id() ) + raise_softirq(TASKLET_SOFTIRQ); + else + cpu_raise_softirq(cpu, TASKLET_SOFTIRQ); } spin_unlock_irqrestore(&tasklet_lock, flags); } +void tasklet_schedule(struct tasklet *t) +{ + tasklet_schedule_list(t, &tasklet_list, smp_processor_id()); +} + +void tasklet_schedule_cpu(struct tasklet *t, int cpu) +{ + tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu); +} + static void tasklet_action(void) { + struct list_head *tlist; struct tasklet *t; spin_lock_irq(&tasklet_lock); - if ( list_empty(&tasklet_list) ) + tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list : + &this_cpu(tasklet_list_pcpu); + if ( list_empty(tlist) ) { spin_unlock_irq(&tasklet_lock); return; } - t = list_entry(tasklet_list.next, struct tasklet, list); + t = list_entry(tlist->next, struct tasklet, list); list_del_init(&t->list); BUG_ON(t->is_dead || t->is_running || !t->is_scheduled); @@ -138,14 +156,15 @@ static void tasklet_action(void) if ( t->is_scheduled ) { BUG_ON(t->is_dead || !list_empty(&t->list)); - list_add_tail(&t->list, &tasklet_list); + list_add_tail(&t->list, tlist); } /* * If there is more work to do then reschedule. We don't grab more work * immediately as we want to allow other softirq work to happen first. 
*/ - if ( !list_empty(&tasklet_list) ) + if ( !list_empty(&tasklet_list) || + !list_empty(&this_cpu(tasklet_list_pcpu)) ) raise_softirq(TASKLET_SOFTIRQ); spin_unlock_irq(&tasklet_lock); @@ -186,6 +205,12 @@ void tasklet_init( void __init softirq_init(void) { + int i; + + for_each_possible_cpu ( i ) + { + INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i)); + } open_softirq(TASKLET_SOFTIRQ, tasklet_action); } Index: xen-4.0.1-testing/xen/common/sysctl.c =================================================================== --- xen-4.0.1-testing.orig/xen/common/sysctl.c +++ xen-4.0.1-testing/xen/common/sysctl.c @@ -314,6 +314,14 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc } break; + case XEN_SYSCTL_cpupool_op: + { + ret = cpupool_do_sysctl(&op->u.cpupool_op); + if ( (ret == 0) && copy_to_guest(u_sysctl, op, 1) ) + ret = -EFAULT; + } + break; + default: ret = arch_do_sysctl(op, u_sysctl); break; Index: xen-4.0.1-testing/xen/include/asm-x86/domain.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/asm-x86/domain.h +++ xen-4.0.1-testing/xen/include/asm-x86/domain.h @@ -458,7 +458,8 @@ struct arch_vcpu #define hvm_svm hvm_vcpu.u.svm /* Continue the current hypercall via func(data) on specified cpu. */ -int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data); +int continue_hypercall_on_cpu(int cpu, void *hdl, + long (*func)(void *hdl, void *data), void *data); void vcpu_show_execution_state(struct vcpu *); void vcpu_show_registers(const struct vcpu *); Index: xen-4.0.1-testing/xen/include/asm-x86/smp.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/asm-x86/smp.h +++ xen-4.0.1-testing/xen/include/asm-x86/smp.h @@ -56,7 +56,6 @@ extern u32 cpu_2_logical_apicid[]; #define CPU_ONLINE 0x0002 /* CPU is up */ #define CPU_DEAD 0x0004 /* CPU is dead */ DECLARE_PER_CPU(int, cpu_state); -extern spinlock_t(cpu_add_remove_lock); #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) extern int cpu_down(unsigned int cpu); Index: xen-4.0.1-testing/xen/include/public/domctl.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/public/domctl.h +++ xen-4.0.1-testing/xen/include/public/domctl.h @@ -60,10 +60,10 @@ struct xen_domctl_createdomain { /* Should domain memory integrity be verifed by tboot during Sx? */ #define _XEN_DOMCTL_CDF_s3_integrity 2 #define XEN_DOMCTL_CDF_s3_integrity (1U<<_XEN_DOMCTL_CDF_s3_integrity) - uint32_t flags; /* Disable out-of-sync shadow page tables? */ #define _XEN_DOMCTL_CDF_oos_off 3 #define XEN_DOMCTL_CDF_oos_off (1U<<_XEN_DOMCTL_CDF_oos_off) + uint32_t flags; }; typedef struct xen_domctl_createdomain xen_domctl_createdomain_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t); @@ -106,6 +106,7 @@ struct xen_domctl_getdomaininfo { uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. 
*/ uint32_t ssidref; xen_domain_handle_t handle; + uint32_t cpupool; }; typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t); @@ -781,7 +782,6 @@ struct xen_domctl_mem_sharing_op { typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t); - struct xen_domctl { uint32_t cmd; #define XEN_DOMCTL_createdomain 1 Index: xen-4.0.1-testing/xen/include/public/sysctl.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/public/sysctl.h +++ xen-4.0.1-testing/xen/include/public/sysctl.h @@ -491,6 +491,28 @@ struct xen_sysctl_lockprof_op { typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t); +#define XEN_SYSCTL_cpupool_op 18 +/* XEN_SYSCTL_cpupool_op */ +#define XEN_SYSCTL_CPUPOOL_OP_CREATE 1 /* C */ +#define XEN_SYSCTL_CPUPOOL_OP_DESTROY 2 /* D */ +#define XEN_SYSCTL_CPUPOOL_OP_INFO 3 /* I */ +#define XEN_SYSCTL_CPUPOOL_OP_ADDCPU 4 /* A */ +#define XEN_SYSCTL_CPUPOOL_OP_RMCPU 5 /* R */ +#define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN 6 /* M */ +#define XEN_SYSCTL_CPUPOOL_OP_FREEINFO 7 /* F */ +#define XEN_SYSCTL_CPUPOOL_PAR_ANY 0xFFFFFFFF +struct xen_sysctl_cpupool_op { + uint32_t op; /* IN */ + uint32_t cpupool_id; /* IN: CDIARM OUT: CI */ + uint32_t sched_id; /* IN: C OUT: I */ + uint32_t domid; /* IN: M */ + uint32_t cpu; /* IN: AR */ + uint32_t n_dom; /* OUT: I */ + struct xenctl_cpumap cpumap; /* OUT: IF */ +}; +typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t); + struct xen_sysctl { uint32_t cmd; uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ @@ -509,6 +531,7 @@ struct xen_sysctl { struct xen_sysctl_pm_op pm_op; struct xen_sysctl_page_offline_op page_offline; struct xen_sysctl_lockprof_op lockprof_op; + struct xen_sysctl_cpupool_op cpupool_op; uint8_t pad[128]; } u; }; Index: xen-4.0.1-testing/xen/include/xen/sched-if.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/xen/sched-if.h +++ xen-4.0.1-testing/xen/include/xen/sched-if.h @@ -10,16 +10,29 @@ #include + +/* A global pointer to the initial cpupool (POOL0). 
*/ +extern struct cpupool *cpupool0; + +/* cpus currently in no cpupool */ +extern cpumask_t cpupool_free_cpus; + +/* cpupool lock (used for cpu on/offline, too) */ +extern spinlock_t cpupool_lock; + struct schedule_data { spinlock_t schedule_lock; /* spinlock protecting curr */ struct vcpu *curr; /* current task */ struct vcpu *idle; /* idle task for this cpu */ void *sched_priv; + void *sched_idlevpriv; /* default scheduler vcpu data */ struct timer s_timer; /* scheduling timer */ atomic_t urgent_count; /* how many urgent vcpus */ } __cacheline_aligned; DECLARE_PER_CPU(struct schedule_data, schedule_data); +DECLARE_PER_CPU(struct scheduler *, scheduler); +DECLARE_PER_CPU(struct cpupool *, cpupool); static inline void vcpu_schedule_lock(struct vcpu *v) { @@ -59,28 +72,49 @@ struct scheduler { char *name; /* full name for this scheduler */ char *opt_name; /* option name for this scheduler */ unsigned int sched_id; /* ID for this scheduler */ + void *sched_data; /* global data pointer */ + + int (*init) (struct scheduler *); + void (*deinit) (struct scheduler *); - void (*init) (void); + void (*free_vdata) (struct scheduler *, void *); + void * (*alloc_vdata) (struct scheduler *, struct vcpu *, + void *); + void (*free_pdata) (struct scheduler *, void *, int); + void * (*alloc_pdata) (struct scheduler *, int); + void (*free_domdata) (struct scheduler *, void *); + void * (*alloc_domdata) (struct scheduler *, struct domain *); - int (*init_domain) (struct domain *); - void (*destroy_domain) (struct domain *); + int (*init_domain) (struct scheduler *, struct domain *); + void (*destroy_domain) (struct scheduler *, struct domain *); - int (*init_vcpu) (struct vcpu *); - void (*destroy_vcpu) (struct vcpu *); + void (*insert_vcpu) (struct scheduler *, struct vcpu *); + void (*destroy_vcpu) (struct scheduler *, struct vcpu *); - void (*sleep) (struct vcpu *); - void (*wake) (struct vcpu *); + void (*sleep) (struct scheduler *, struct vcpu *); + void (*wake) (struct scheduler *, struct vcpu *); - struct task_slice (*do_schedule) (s_time_t); + struct task_slice (*do_schedule) (struct scheduler *, s_time_t); - int (*pick_cpu) (struct vcpu *); - int (*adjust) (struct domain *, + int (*pick_cpu) (struct scheduler *, struct vcpu *); + int (*adjust) (struct scheduler *, struct domain *, struct xen_domctl_scheduler_op *); - void (*dump_settings) (void); - void (*dump_cpu_state) (int); + void (*dump_settings) (struct scheduler *); + void (*dump_cpu_state) (struct scheduler *, int); - void (*tick_suspend) (void); - void (*tick_resume) (void); + void (*tick_suspend) (struct scheduler *, unsigned int); + void (*tick_resume) (struct scheduler *, unsigned int); }; +struct cpupool +{ + int cpupool_id; + cpumask_t cpu_valid; /* all cpus assigned to pool */ + struct cpupool *next; + unsigned int n_dom; + struct scheduler sched; +}; + +struct scheduler *scheduler_get_by_id(unsigned int id); + #endif /* __XEN_SCHED_IF_H__ */ Index: xen-4.0.1-testing/xen/include/xen/sched.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/xen/sched.h +++ xen-4.0.1-testing/xen/include/xen/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -132,8 +133,6 @@ struct vcpu bool_t defer_shutdown; /* VCPU is paused following shutdown request (d->is_shutting_down)? */ bool_t paused_for_shutdown; - /* VCPU affinity is temporarily locked from controller changes? 
*/ - bool_t affinity_locked; /* * > 0: a single port is being polled; @@ -209,6 +208,7 @@ struct domain /* Scheduling. */ void *sched_priv; /* scheduler-specific data */ + struct cpupool *cpupool; struct domain *next_in_list; struct domain *next_in_hashbucket; @@ -381,7 +381,7 @@ static inline struct domain *get_current } struct domain *domain_create( - domid_t domid, unsigned int domcr_flags, ssidref_t ssidref); + domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref); /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */ #define _DOMCRF_hvm 0 #define DOMCRF_hvm (1U<<_DOMCRF_hvm) @@ -469,6 +469,7 @@ int sched_init_vcpu(struct vcpu *v, uns void sched_destroy_vcpu(struct vcpu *v); int sched_init_domain(struct domain *d); void sched_destroy_domain(struct domain *d); +int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); int sched_id(void); void sched_tick_suspend(void); @@ -576,12 +577,14 @@ void domain_pause_by_systemcontroller(st void domain_unpause_by_systemcontroller(struct domain *d); void cpu_init(void); +struct scheduler; + +int schedule_init_global(char *name, struct scheduler *sched); +void schedule_deinit_global(struct scheduler *sched); +void schedule_cpu_switch(unsigned int cpu, struct cpupool *c); void vcpu_force_reschedule(struct vcpu *v); -void cpu_disable_scheduler(void); +int cpu_disable_scheduler(unsigned int cpu, int lock); int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity); -int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity); -int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity); -void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity); void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); uint64_t get_cpu_idle_time(unsigned int cpu); @@ -604,6 +607,18 @@ extern enum cpufreq_controller { FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen } cpufreq_controller; +#define CPUPOOLID_NONE -1 + +struct cpupool *cpupool_create(int poolid, char *sched); +int cpupool_destroy(struct cpupool *c); +int cpupool0_cpu_assign(struct cpupool *c); +int cpupool_assign_ncpu(struct cpupool *c, int ncpu); +void cpupool_cpu_add(unsigned int cpu); +int cpupool_add_domain(struct domain *d, int poolid); +void cpupool_rm_domain(struct domain *d); +int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op); +#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid)) + #endif /* __SCHED_H__ */ /* Index: xen-4.0.1-testing/xen/include/xen/softirq.h =================================================================== --- xen-4.0.1-testing.orig/xen/include/xen/softirq.h +++ xen-4.0.1-testing/xen/include/xen/softirq.h @@ -58,6 +58,7 @@ struct tasklet struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data } void tasklet_schedule(struct tasklet *t); +void tasklet_schedule_cpu(struct tasklet *t, int cpu); void tasklet_kill(struct tasklet *t); void tasklet_init( struct tasklet *t, void (*func)(unsigned long), unsigned long data);
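
The sketches below are not part of the patch; they only illustrate how the interfaces introduced above might be used, and every nsched_*/do_frob/my_* identifier in them is made up for illustration. First, the reworked interface in xen/sched-if.h passes a struct scheduler * to every hook and splits allocation into alloc_vdata/alloc_pdata/alloc_domdata, so per-instance (per-cpupool) state hangs off ops->sched_data instead of a file-scope global. A minimal, hypothetical scheduler skeleton under those assumptions:

    /* Hypothetical skeleton only -- nsched_* is not a real Xen scheduler.
     * Mandatory hooks such as do_schedule, pick_cpu, sleep and wake are
     * omitted for brevity. */

    struct nsched_private {              /* per-instance (per-cpupool) state */
        unsigned int ncpus;
    };

    struct nsched_vcpu {                 /* per-vcpu state */
        struct vcpu *vcpu;
    };

    static int nsched_init(struct scheduler *ops)
    {
        struct nsched_private *prv = xmalloc(struct nsched_private);

        if ( prv == NULL )
            return -ENOMEM;              /* scheduler_init()/schedule_init_global() check this */
        memset(prv, 0, sizeof(*prv));
        ops->sched_data = prv;           /* instance state hangs off the ops struct */
        return 0;
    }

    static void nsched_deinit(struct scheduler *ops)
    {
        xfree(ops->sched_data);
    }

    static void *nsched_alloc_vdata(struct scheduler *ops, struct vcpu *v,
                                    void *dd)
    {
        struct nsched_vcpu *nv = xmalloc(struct nsched_vcpu);

        if ( nv == NULL )
            return NULL;
        memset(nv, 0, sizeof(*nv));
        nv->vcpu = v;
        return nv;                       /* caller stores this in v->sched_priv */
    }

    static void nsched_free_vdata(struct scheduler *ops, void *priv)
    {
        xfree(priv);
    }

    struct scheduler sched_nsched_def = {
        .name        = "Example Scheduler",
        .opt_name    = "nsched",
        .sched_id    = 0,                /* a real scheduler needs a XEN_SCHEDULER_* id */
        .init        = nsched_init,
        .deinit      = nsched_deinit,
        .alloc_vdata = nsched_alloc_vdata,
        .free_vdata  = nsched_free_vdata,
    };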
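
The new XEN_SYSCTL_cpupool_op interface in xen/public/sysctl.h is driven by filling in xen_sysctl_cpupool_op as annotated in the structure (cpupool_id is IN for most sub-ops and OUT for CREATE/INFO). A hedged sketch of creating a pool and assigning a cpu to it follows; do_sysctl_fn stands in for whatever mechanism the caller uses to issue a sysctl, and letting Xen pick the pool id via XEN_SYSCTL_CPUPOOL_PAR_ANY is an assumption, not something this header by itself guarantees:

    /* Sketch only.  do_sysctl_fn stands in for the caller's mechanism for
     * issuing a sysctl (privcmd ioctl, libxc wrapper, ...); it is not an
     * interface defined by this patch. */

    static int create_pool_with_cpu(int (*do_sysctl_fn)(struct xen_sysctl *),
                                    uint32_t sched_id, uint32_t cpu,
                                    uint32_t *poolid)
    {
        struct xen_sysctl op;
        int rc;

        memset(&op, 0, sizeof(op));
        op.cmd               = XEN_SYSCTL_cpupool_op;
        op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;

        /* Create an empty pool running the requested scheduler. */
        op.u.cpupool_op.op         = XEN_SYSCTL_CPUPOOL_OP_CREATE;
        op.u.cpupool_op.cpupool_id = XEN_SYSCTL_CPUPOOL_PAR_ANY; /* assumed: Xen picks the id */
        op.u.cpupool_op.sched_id   = sched_id;    /* e.g. XEN_SCHEDULER_CREDIT */
        if ( (rc = do_sysctl_fn(&op)) != 0 )
            return rc;
        *poolid = op.u.cpupool_op.cpupool_id;     /* OUT for the create op */

        /* Move one cpu into the new pool. */
        op.u.cpupool_op.op         = XEN_SYSCTL_CPUPOOL_OP_ADDCPU;
        op.u.cpupool_op.cpupool_id = *poolid;
        op.u.cpupool_op.cpu        = cpu;
        return do_sysctl_fn(&op);
    }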
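
continue_hypercall_on_cpu() in asm-x86/domain.h now takes a continuation handle and hands it back to the callback: the initial caller passes NULL, and a callback that wants to continue on yet another cpu passes its hdl straight through so the existing continuation state is reused. A sketch of that calling pattern, with made-up names:

    /* Illustrative names only -- there is no do_frob() in the tree. */

    struct frob_info {
        unsigned int cpu;
        int          error;
    };

    static long do_frob(void *hdl, void *data)
    {
        struct frob_info *info = data;

        /* ... per-cpu work on smp_processor_id() goes here, setting
         *     info->error on failure ... */

        info->cpu = next_cpu(info->cpu, cpu_online_map);
        if ( info->cpu < NR_CPUS )
            /* Re-continue on the next cpu, reusing the continuation via hdl. */
            return continue_hypercall_on_cpu(info->cpu, hdl, do_frob, info);

        return info->error;
    }

    static long frob_all_cpus(struct frob_info *info)
    {
        info->cpu = first_cpu(cpu_online_map);
        /* First call: no continuation exists yet, so the handle is NULL. */
        return continue_hypercall_on_cpu(info->cpu, NULL, do_frob, info);
    }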
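
Finally, tasklet_schedule_cpu() in xen/softirq.h queues a tasklet on the target cpu's new per-cpu list and raises TASKLET_SOFTIRQ there, while plain tasklet_schedule() keeps using the global list on the local cpu. A small illustrative use:

    /* Illustrative only: defer some work to a particular cpu. */

    static struct tasklet my_tasklet;

    static void my_work(unsigned long data)
    {
        /* Runs in softirq context on the cpu the tasklet was queued on. */
    }

    static void __init my_work_init(void)
    {
        tasklet_init(&my_tasklet, my_work, 0);
    }

    static void my_work_on_cpu(unsigned int cpu)
    {
        if ( cpu == smp_processor_id() )
            tasklet_schedule(&my_tasklet);          /* global list, local softirq */
        else
            tasklet_schedule_cpu(&my_tasklet, cpu); /* per-cpu list, remote kick */
    }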