aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2023-08-10 09:05:43 +0200
committerIngo Molnar <mingo@kernel.org>2023-08-10 09:05:43 +0200
commitb41bbb33cf75d251a816768580819aec17be718d (patch)
treef50ecdfa8f7608af122e6215b7a967265085fb32
parent88c56cfeaec4642aee8aac58b38d5708c6aae0d3 (diff)
parentd07f09a1f99cabbc86bc5c97d962eb8a466106b5 (diff)
Merge branch 'sched/eevdf' into sched/core
Pick up the EEVDF work into the main branch - it's looking good so far. Conflicts: kernel/sched/features.h Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--include/linux/rbtree_augmented.h26
-rw-r--r--include/linux/sched.h7
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c1091
-rw-r--r--kernel/sched/features.h23
-rw-r--r--kernel/sched/sched.h15
7 files changed, 563 insertions, 649 deletions
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 7ee7ed5de722..6dbc5a1bf6a8 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
rb_insert_augmented(node, &root->rb_root, augment);
}
+static __always_inline struct rb_node *
+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *),
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(node, parent, link);
+ augment->propagate(parent, NULL); /* suboptimal */
+ rb_insert_augmented_cached(node, tree, leftmost, augment);
+
+ return leftmost ? node : NULL;
+}
+
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aab7be46f7e..177b3f3676ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,13 +549,18 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_deadline;
+
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
- u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 vruntime;
+ s64 vlag;
+ u64 slice;
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 614271a75525..a97eab3e775a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,6 +4527,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aeeba46a096b..4c3d0d9f3db6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -582,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -627,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+ struct sched_entity *last, *first;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -644,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -864,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_idle_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c28206499a3d..f496cef90ce7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -57,22 +58,6 @@
#include "autogroup.h"
/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
* The initial- and re-scaling of tunables is configurable
*
* Options are:
@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
- *
- * (default: 0.75 msec)
- */
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -277,9 +237,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is it's virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-continguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplcation could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load)
+ avg = div_s64(avg, load);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 lag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
- max_vruntime(cfs_rq->min_vruntime, vruntime));
+ __update_min_vruntime(cfs_rq, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ avg_vruntime_add(cfs_rq, se);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
+ avg_vruntime_sub(cfs_rq, se);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -660,14 +845,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
- return __node_2_se(next);
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earlest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
}
#ifdef CONFIG_SCHED_DEBUG
@@ -689,104 +941,45 @@ int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static u64 __sched_period(unsigned long nr_running)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
-
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
-
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- unsigned int nr_running = cfs_rq->nr_running;
- struct sched_entity *init_se = se;
- unsigned int min_gran;
- u64 slice;
-
- if (sched_feat(ALT_PERIOD))
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
-
- slice = __sched_period(nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
- struct cfs_rq *qcfs_rq;
-
- qcfs_rq = cfs_rq_of(se);
- load = &qcfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = qcfs_rq->load;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
- }
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ se->slice = sysctl_sched_base_slice;
- if (sched_feat(BASE_SLICE)) {
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
- min_gran = sysctl_sched_idle_min_granularity;
- else
- min_gran = sysctl_sched_min_granularity;
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
- slice = max_t(u64, slice, min_gran);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
}
-
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
@@ -921,6 +1114,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -3375,16 +3569,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ unsigned long old_weight = se->load.weight;
+
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+ else
+ avg_vruntime_sub(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
update_load_set(&se->load, weight);
+ if (!se->on_rq) {
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
+ }
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3394,9 +3608,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
+ if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
-
+ if (cfs_rq->curr != se)
+ avg_vruntime_add(cfs_rq, se);
+ }
}
void reweight_task(struct task_struct *p, int prio)
@@ -4692,98 +4908,103 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- u64 sleep_time;
-
- if (se->exec_start == 0)
- return false;
+ u64 vslice = calc_delta_fair(se->slice, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
- cfs_rq = cfs_rq_of(se);
-
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ /*
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
+ */
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
- return false;
+ lag = se->vlag;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ /*
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
+ */
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- return false;
-}
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
+ se->vruntime = vruntime - lag;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * When joining the competition; the exisiting tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
-
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
-
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /*
- * Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards.
- * However, min_vruntime can advance much faster than real time, with
- * the extreme being when an entity with the minimal weight always runs
- * on the cfs_rq. If the waking entity slept for a long time, its
- * vruntime difference from min_vruntime may overflow s64 and their
- * comparison may get inversed, so ignore the entity's original
- * vruntime in that case.
- * The maximal vruntime speedup is given by the ratio of normal to
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
- * When placing a migrated waking entity, its exec_start has been set
- * from a different rq. In order to take into account a possible
- * divergence between new and prev rq's clocks task because of irq and
- * stolen time, we take an additional margin.
- * So, cutting off on the sleep time of
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
- * should be safe.
- */
- if (entity_is_long_sleeper(se))
- se->vruntime = vruntime;
- else
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4791,61 +5012,21 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *