#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/reciprocal_div.h>
#include <linux/slab.h>
#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"

bool schedtune_initialized = false;
extern struct reciprocal_value schedtune_spc_rdiv;

/* We hold schedtune boost in effect for at least this long */
#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
	/* SchedTune CGroup subsystem */
	struct cgroup_subsys_state css;

	/* Boost group allocated ID */
	int idx;

	/* Boost value for tasks on that SchedTune CGroup */
	int boost;

	/* Hint to bias scheduling of tasks on that SchedTune CGroup
	 * towards idle CPUs */
	int prefer_idle;

	/* Hint to bias scheduling of tasks on that SchedTune CGroup
	 * towards high performance CPUs */
	int prefer_perf;

	/* SchedTune util-est */
	int util_est_en;

	/* Hint to group tasks by process */
	int band;

	/* SchedTune ontime migration */
	int ontime_en;
};

static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
	return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
	return css_st(st->css.parent);
}

/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boosting tuning,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not into a child control group.
 */
static struct schedtune root_schedtune = {
	.boost		= 0,
	.prefer_idle	= 0,
	.prefer_perf	= 0,
	.band		= 0,
};

/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads which
 *    make sense to boost with different values (e.g. background vs foreground
 *    tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 5

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
	&root_schedtune,
	NULL,
};
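/*
 * Example configuration from user space (hypothetical mount point and group
 * name, shown only to illustrate the interface exposed by files[] below):
 *
 *   mount -t cgroup -o schedtune none /dev/stune
 *   mkdir /dev/stune/foreground
 *   echo 10 > /dev/stune/foreground/schedtune.boost
 *   echo 1  > /dev/stune/foreground/schedtune.prefer_idle
 *   echo $TASK_PID > /dev/stune/foreground/tasks
 *
 * This creates one of the BOOSTGROUPS_COUNT - 1 child groups and attaches a
 * task to it; tasks left in the root group keep the system-wide defaults.
 */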
/*
 * SchedTune boost groups
 * Keep track of all the boost groups which impact on CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
	bool idle;
	int boost_max;
	u64 boost_ts;
	struct {
		/* The boost for tasks on that boost group */
		int boost;
		/* Count of RUNNABLE tasks on that boost group */
		unsigned tasks;
		/* Timestamp of boost activation */
		u64 ts;
	} group[BOOSTGROUPS_COUNT];
	/* CPU's boost group locking */
	raw_spinlock_t lock;
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);

static inline bool schedtune_boost_timeout(u64 now, u64 ts)
{
	return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
}

static inline bool schedtune_boost_group_active(int idx,
		struct boost_groups *bg, u64 now)
{
	if (bg->group[idx].tasks)
		return true;

	return !schedtune_boost_timeout(now, bg->group[idx].ts);
}

static void schedtune_cpu_update(int cpu, u64 now)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int boost_max;
	u64 boost_ts;
	int idx;

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	boost_ts = now;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU or it has hold
		 * in effect from a previous task.
		 */
		if (!schedtune_boost_group_active(idx, bg, now))
			continue;

		/* This boost group is active */
		if (boost_max > bg->group[idx].boost)
			continue;

		boost_max = bg->group[idx].boost;
		boost_ts = bg->group[idx].ts;
	}

	/*
	 * Ensures boost_max is non-negative when all cgroup boost values
	 * are negative. Avoids under-accounting of cpu capacity which may
	 * cause task stacking and frequency spikes.
	 */
	boost_max = max(boost_max, 0);
	bg->boost_max = boost_max;
	bg->boost_ts = boost_ts;
}

static int schedtune_boostgroup_update(int idx, int boost)
{
	struct boost_groups *bg;
	int cur_boost_max;
	int old_boost;
	int cpu;
	u64 now;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases the current max */
		now = sched_clock_cpu(cpu);
		if (boost > cur_boost_max &&
		    schedtune_boost_group_active(idx, bg, now)) {
			bg->boost_max = boost;
			bg->boost_ts = bg->group[idx].ts;

			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
			continue;
		}

		/* Check if this update has decreased the current max */
		if (cur_boost_max == old_boost && old_boost > boost) {
			schedtune_cpu_update(cpu, now);
			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
			continue;
		}

		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
	}

	return 0;
}

#define ENQUEUE_TASK  1
#define DEQUEUE_TASK -1

static inline bool schedtune_update_timestamp(struct task_struct *p)
{
	if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
		return true;

	return task_has_rt_policy(p);
}

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int tasks = bg->group[idx].tasks + task_count;

	/* Update boosted tasks count while avoiding to make it negative */
	bg->group[idx].tasks = max(0, tasks);

	/* Update timeout on enqueue */
	if (task_count > 0) {
		u64 now = sched_clock_cpu(cpu);

		if (schedtune_update_timestamp(p))
			bg->group[idx].ts = now;

		/* Boost group activation or deactivation on that RQ */
		if (bg->group[idx].tasks == 1)
			schedtune_cpu_update(cpu, now);
	}

	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
			bg->group[idx].boost, bg->boost_max,
			bg->group[idx].ts);
}
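/*
 * Illustrative timeline for the boost hold above (example values only):
 * take a child group with boost = 30 and a root group with boost = 0 on CPUx.
 *
 *   t0            : an RT task of the group is enqueued; group ts = t0 and
 *                   schedtune_cpu_update() raises bg->boost_max to 30
 *   t1 < t0+50ms  : the task is dequeued; tasks drops to 0 but the group is
 *                   still reported active by schedtune_boost_group_active()
 *                   until t0 + SCHEDTUNE_BOOST_HOLD_NS, so boost_max stays 30
 *   t >= t0+50ms  : the next schedtune_cpu_boost() call sees the timeout and
 *                   schedtune_cpu_update() lets boost_max fall back to 0
 */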
/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (unlikely(!schedtune_initialized))
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupt to be disabled to avoid race conditions for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

int schedtune_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	struct boost_groups *bg;
	struct rq_flags rq_flags;
	unsigned int cpu;
	struct rq *rq;
	int src_bg; /* Source boost group index */
	int dst_bg; /* Destination boost group index */
	int tasks;
	u64 now;

	if (unlikely(!schedtune_initialized))
		return 0;

	cgroup_taskset_for_each(task, css, tset) {

		/*
		 * Lock the CPU's RQ the task is enqueued to avoid race
		 * conditions with migration code while the task is being
		 * accounted
		 */
		rq = task_rq_lock(task, &rq_flags);

		if (!task->on_rq) {
			task_rq_unlock(rq, task, &rq_flags);
			continue;
		}

		/*
		 * Boost group accounting is protected by a per-cpu lock and
		 * requires interrupt to be disabled to avoid race conditions
		 * on...
		 */
		cpu = cpu_of(rq);
		bg = &per_cpu(cpu_boost_groups, cpu);
		raw_spin_lock(&bg->lock);

		dst_bg = css_st(css)->idx;
		src_bg = task_schedtune(task)->idx;

		/*
		 * Current task is not changing boostgroup, which can
		 * happen when the new hierarchy is in use.
		 */
		if (unlikely(dst_bg == src_bg)) {
			raw_spin_unlock(&bg->lock);
			task_rq_unlock(rq, task, &rq_flags);
			continue;
		}

		/*
		 * This is the case of a RUNNABLE task which is switching its
		 * current boost group.
		 */

		/* Move task from src to dst boost group */
		tasks = bg->group[src_bg].tasks - 1;
		bg->group[src_bg].tasks = max(0, tasks);
		bg->group[dst_bg].tasks += 1;

		/* Update boost hold start for this group */
		now = sched_clock_cpu(cpu);
		bg->group[dst_bg].ts = now;

		/* Force boost group re-evaluation at next boost check */
		bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;

		raw_spin_unlock(&bg->lock);
		task_rq_unlock(rq, task, &rq_flags);
	}

	return 0;
}

void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
	/*
	 * This can happen only if SchedTune controller is mounted with
	 * other hierarchies and one of them fails. Since usually SchedTune is
	 * mounted on its own hierarchy, for the time being we do not implement
	 * a proper rollback mechanism.
	 */
	WARN(1, "SchedTune cancel attach not implemented");
}

static void schedtune_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		sync_band(task, css_st(css)->band);
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (unlikely(!schedtune_initialized))
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupt to be disabled to avoid race conditions on...
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

int schedtune_cpu_boost(int cpu)
{
	struct boost_groups *bg;
	u64 now;

	bg = &per_cpu(cpu_boost_groups, cpu);
	now = sched_clock_cpu(cpu);

	/* Check to see if we have a hold in effect */
	if (schedtune_boost_timeout(now, bg->boost_ts))
		schedtune_cpu_update(cpu, now);

	return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int task_boost;

	if (unlikely(!schedtune_initialized))
		return 0;

	/* Get task boost value */
	rcu_read_lock();
	st = task_schedtune(p);
	task_boost = st->boost;
	rcu_read_unlock();

	return task_boost;
}

int schedtune_util_est_en(struct task_struct *p)
{
	struct schedtune *st;
	int util_est_en;

	if (unlikely(!schedtune_initialized))
		return 0;

	/* Get util_est value */
	rcu_read_lock();
	st = task_schedtune(p);
	util_est_en = st->util_est_en;
	rcu_read_unlock();

	return util_est_en;
}

int schedtune_ontime_en(struct task_struct *p)
{
	struct schedtune *st;
	int ontime_en;

	if (unlikely(!schedtune_initialized))
		return 0;

	/* Get ontime value */
	rcu_read_lock();
	st = task_schedtune(p);
	ontime_en = st->ontime_en;
	rcu_read_unlock();

	return ontime_en;
}

int schedtune_prefer_idle(struct task_struct *p)
{
	struct schedtune *st;
	int prefer_idle;

	if (unlikely(!schedtune_initialized))
		return 0;

	/* Get prefer_idle value */
	rcu_read_lock();
	st = task_schedtune(p);
	prefer_idle = st->prefer_idle;
	rcu_read_unlock();

	return prefer_idle;
}

int schedtune_prefer_perf(struct task_struct *p)
{
	struct schedtune *st;
	int prefer_perf;

	if (unlikely(!schedtune_initialized))
		return 0;

	/* Get prefer_perf value */
	rcu_read_lock();
	st = task_schedtune(p);
	prefer_perf = max(st->prefer_perf, kpp_status(st->idx));
	rcu_read_unlock();

	return prefer_perf;
}

static u64
util_est_en_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->util_est_en;
}

static int
util_est_en_write(struct cgroup_subsys_state *css, struct cftype *cft,
		  u64 util_est_en)
{
	struct schedtune *st = css_st(css);

	st->util_est_en = util_est_en;

	return 0;
}

static u64
ontime_en_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->ontime_en;
}

static int
ontime_en_write(struct cgroup_subsys_state *css, struct cftype *cft,
		u64 ontime_en)
{
	struct schedtune *st = css_st(css);

	st->ontime_en = ontime_en;

	return 0;
}

static u64
band_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->band;
}

static int
band_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 band)
{
	struct schedtune *st = css_st(css);

	st->band = band;

	return 0;
}

static u64
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->prefer_idle;
}

static int
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
		  u64 prefer_idle)
{
	struct schedtune *st = css_st(css);

	st->prefer_idle = !!prefer_idle;

	return 0;
}

static u64
prefer_perf_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->prefer_perf;
}

static int
prefer_perf_write(struct cgroup_subsys_state *css, struct cftype *cft,
		  u64 prefer_perf)
{
	struct schedtune *st = css_st(css);

	st->prefer_perf = prefer_perf;

	return 0;
}
static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost)
{
	struct schedtune *st = css_st(css);

	if (boost < 0 || boost > 100)
		return -EINVAL;

	st->boost = boost;

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	return 0;
}

static struct cftype files[] = {
	{
		.name = "boost",
		.read_s64 = boost_read,
		.write_s64 = boost_write,
	},
	{
		.name = "prefer_idle",
		.read_u64 = prefer_idle_read,
		.write_u64 = prefer_idle_write,
	},
	{
		.name = "prefer_perf",
		.read_u64 = prefer_perf_read,
		.write_u64 = prefer_perf_write,
	},
	{
		.name = "band",
		.read_u64 = band_read,
		.write_u64 = band_write,
	},
	{
		.name = "util_est_en",
		.read_u64 = util_est_en_read,
		.write_u64 = util_est_en_write,
	},
	{
		.name = "ontime_en",
		.read_u64 = ontime_en_read,
		.write_u64 = ontime_en_write,
	},
	{ }	/* terminate */
};

static int
schedtune_boostgroup_init(struct schedtune *st)
{
	struct boost_groups *bg;
	int cpu;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
		bg->group[st->idx].ts = 0;
	}

	return 0;
}

static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct schedtune *st;
	int idx;

	if (!parent_css)
		return &root_schedtune.css;

	/* Allow only single level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
			break;
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		       BOOSTGROUPS_COUNT);
		return ERR_PTR(-ENOSPC);
	}

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		goto out;

	/* Initialize per CPUs boost group support */
	st->idx = idx;
	if (schedtune_boostgroup_init(st))
		goto release;

	return &st->css;

release:
	kfree(st);
out:
	return ERR_PTR(-ENOMEM);
}

static void
schedtune_boostgroup_release(struct schedtune *st)
{
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
	kfree(st);
}

struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc	= schedtune_css_alloc,
	.css_free	= schedtune_css_free,
	.can_attach	= schedtune_can_attach,
	.cancel_attach	= schedtune_cancel_attach,
	.attach		= schedtune_attach,
	.legacy_cftypes	= files,
	.early_init	= 1,
};

static inline void
schedtune_init_cgroups(void)
{
	struct boost_groups *bg;
	int cpu;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));
		raw_spin_lock_init(&bg->lock);
	}

	pr_info("schedtune: configured to support %d boost groups\n",
		BOOSTGROUPS_COUNT);

	schedtune_initialized = true;
}

/*
 * Initialize the cgroup structures
 */
static int
schedtune_init(void)
{
	schedtune_spc_rdiv = reciprocal_value(100);
	schedtune_init_cgroups();

	return 0;
}
postcore_initcall(schedtune_init);
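/*
 * A sketch of how these values are typically consumed on the scheduler side
 * (the margin helper itself lives in fair.c-style code, not in this file;
 * the exact helper name is an assumption). schedtune_spc_rdiv, set to
 * reciprocal_value(100) in schedtune_init(), turns the divide-by-100 of the
 * boost percentage into a cheap reciprocal multiply, e.g. for a positive
 * boost:
 *
 *   margin  = (SCHED_CAPACITY_SCALE - util) * boost;
 *   margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
 *   boosted = util + margin;
 */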