diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-16 17:25:49 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-16 17:25:49 -0700 |
| commit | 7e67a859997aad47727aff9c5a32e160da079ce3 (patch) | |
| tree | 96f53425c2834de5b3276d7598782ab6412e4d5e /kernel | |
| parent | 772c1d06bd402f7ee72c61a18c2db74cd74b6758 (diff) | |
| parent | 563c4f85f9f0d63b712081d5b4522152cdcb8b6b (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- MAINTAINERS: Add Mark Rutland as perf submaintainer, Juri Lelli and
Vincent Guittot as scheduler submaintainers. Add Dietmar Eggemann,
Steven Rostedt, Ben Segall and Mel Gorman as scheduler reviewers.
As perf and the scheduler is getting bigger and more complex,
document the status quo of current responsibilities and interests,
and spread the review pain^H^H^H^H fun via an increase in the Cc:
linecount generated by scripts/get_maintainer.pl. :-)
- Add another series of patches that brings the -rt (PREEMPT_RT) tree
closer to mainline: split the monolithic CONFIG_PREEMPT dependencies
into a new CONFIG_PREEMPTION category that will allow the eventual
introduction of CONFIG_PREEMPT_RT. Still a few more hundred patches
to go though.
- Extend the CPU cgroup controller with uclamp.min and uclamp.max to
allow the finer shaping of CPU bandwidth usage.
- Micro-optimize energy-aware wake-ups from O(CPUS^2) to O(CPUS).
- Improve the behavior of high CPU count, high thread count
applications running under cpu.cfs_quota_us constraints.
- Improve balancing with SCHED_IDLE (SCHED_BATCH) tasks present.
- Improve CPU isolation housekeeping CPU allocation NUMA locality.
- Fix deadline scheduler bandwidth calculations and logic when cpusets
rebuilds the topology, or when it gets deadline-throttled while it's
being offlined.
- Convert the cpuset_mutex to percpu_rwsem, to allow it to be used from
setscheduler() system calls without creating global serialization.
Add new synchronization between cpuset topology-changing events and
the deadline acceptance tests in setscheduler(), which were broken
before.
- Rework the active_mm state machine to be less confusing and more
optimal.
- Rework (simplify) the pick_next_task() slowpath.
- Improve load-balancing on AMD EPYC systems.
- ... and misc cleanups, smaller fixes and improvements - please see
the Git log for more details.
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (53 commits)
sched/psi: Correct overly pessimistic size calculation
sched/fair: Speed-up energy-aware wake-ups
sched/uclamp: Always use 'enum uclamp_id' for clamp_id values
sched/uclamp: Update CPU's refcount on TG's clamp changes
sched/uclamp: Use TG's clamps to restrict TASK's clamps
sched/uclamp: Propagate system defaults to the root group
sched/uclamp: Propagate parent clamps
sched/uclamp: Extend CPU's cgroup controller
sched/topology: Improve load balancing on AMD EPYC systems
arch, ia64: Make NUMA select SMP
sched, perf: MAINTAINERS update, add submaintainers and reviewers
sched/fair: Use rq_lock/unlock in online_fair_sched_group
cpufreq: schedutil: fix equation in comment
sched: Rework pick_next_task() slow-path
sched: Allow put_prev_task() to drop rq->lock
sched/fair: Expose newidle_balance()
sched: Add task_struct pointer to sched_class::set_curr_task
sched: Rework CPU hotplug task selection
sched/{rt,deadline}: Fix set_next_task vs pick_next_task
sched: Fix kerneldoc comment for ia64_set_curr_task
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cgroup/cgroup.c | 2 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 163 | ||||
| -rw-r--r-- | kernel/events/core.c | 9 | ||||
| -rw-r--r-- | kernel/irq/manage.c | 3 | ||||
| -rw-r--r-- | kernel/kprobes.c | 2 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.c | 6 | ||||
| -rw-r--r-- | kernel/rcu/Kconfig | 8 | ||||
| -rw-r--r-- | kernel/rcu/tree.c | 12 | ||||
| -rw-r--r-- | kernel/rcu/tree_stall.h | 6 | ||||
| -rw-r--r-- | kernel/sched/core.c | 561 | ||||
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 6 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 134 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 409 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 31 | ||||
| -rw-r--r-- | kernel/sched/isolation.c | 12 | ||||
| -rw-r--r-- | kernel/sched/psi.c | 2 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 74 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 63 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 7 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c | 22 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 53 | ||||
| -rw-r--r-- | kernel/stop_machine.c | 2 | ||||
| -rw-r--r-- | kernel/trace/Kconfig | 6 | ||||
| -rw-r--r-- | kernel/trace/ftrace.c | 2 | ||||
| -rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 2 | ||||
| -rw-r--r-- | kernel/trace/trace_events.c | 4 | ||||
| -rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 3 |
27 files changed, 1058 insertions, 546 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8be1da1ebd9a..a7ce73a2c401 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc) */ static bool use_task_css_set_links __read_mostly; -static void cgroup_enable_task_cg_lists(void) +void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5aa37531ce76..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/seq_file.h> @@ -332,7 +333,18 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -static DEFINE_MUTEX(cpuset_mutex); +DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ + percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ + percpu_up_read(&cpuset_rwsem); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -894,6 +906,67 @@ done: return ndoms; } +static void update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +static void rebuild_root_domains(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + + percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + cgroup_enable_task_cg_lists(); + + rcu_read_lock(); + + /* + * Clear default root domain DL accounting, it will be computed again + * if a task belongs to it. + */ + dl_clear_root_domain(&def_root_domain); + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + rebuild_root_domains(); + mutex_unlock(&sched_domains_mutex); +} + /* * Rebuild scheduler domains. * @@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; - lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); + lockdep_assert_cpus_held(); + percpu_rwsem_assert_held(&cpuset_rwsem); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void) */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; if (top_cpuset.nr_subparts_cpus && !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); + partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } /** @@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ bool part_error = false; /* Partition error? */ - lockdep_assert_held(&cpuset_mutex); + percpu_rwsem_assert_held(&cpuset_rwsem); /* * The parent must be a partition root. @@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; @@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cs->attach_in_progress++; ret = 0; out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return ret; } @@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); css_cs(css)->attach_in_progress--; - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* prepare for attach */ if (cs == &top_cpuset) @@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* The various types of files and directories in a cpuset file system */ @@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; } @@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return 0; } @@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) update_prstate(cs, 0); @@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { + BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* * Move tasks to the nearest ancestor with execution resources, @@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); } static void @@ -2936,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); goto retry; } @@ -3011,7 +3098,7 @@ update_tasks: hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /** @@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 2aad959e6def..1c414b8866b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4174,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } + if (task) + ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; @@ -10440,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - get_task_struct(task); - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..9d50fbe5531a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ - get_task_struct(t); - new->thread = t; + new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebe8315a756a..1b66ccbb744a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1907,7 +1907,7 @@ int register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fa83d36e30c6..2874bf556162 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* @@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. the owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if !PREEMPTION && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if PREEMPTION help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPTION && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - def_bool PREEMPT + def_bool PREEMPTION select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 71395e91b876..81105141b6a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1912,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2266,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || + if (!IS_ENABLED(CONFIG_PREEMPTION) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2681,7 +2681,7 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); @@ -3297,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; - rnp = rcu_get_root(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.gp_kthread = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 841ab43f3e60..c0b8c458d8a6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */ /* * Dump stacks of all tasks running on stalled CPUs. First try using diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7fa8e74ad2ab..06961b997ed6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. + * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. + */ +static DEFINE_MUTEX(uclamp_mutex); + /* Max allowed minimum utilization */ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline unsigned int uclamp_none(int clamp_id) +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, } static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* @@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id, return uclamp_none(UCLAMP_MIN); } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* Reset max-clamp retention only on idle exit */ @@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, } static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, - unsigned int clamp_value) +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct uclamp_se uc_max; + + /* + * Tasks in autogroups or root task group will be + * restricted by system defaults. + */ + if (task_group_is_autogroup(task_group(p))) + return uc_req; + if (task_group(p) == &root_task_group) + return uc_req; + + uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value || !uc_req.user_defined) + return uc_max; +#endif + + return uc_req; +} + /* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + * group or in an autogroup * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { - struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; /* System default restrictions always apply */ @@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; @@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, * enforce the expected state and warn. */ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct rq_flags rf; + struct rq *rq; + + /* + * Lock the task and the rq where the task is |
