aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 12:03:56 +0100
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 12:03:56 +0100
commit 83476cc97bc635a3ff502bd194c79bfb1f1ae050 (patch)
tree efa273a93be9a4480b575a6c2d46e5c201b109e9
parent d4d9d39f046012ff330e81dcd9b1beadf3759f7e (diff)
parent a99ce697ea5e27b867c9ba4ee55fa5ba3b8d1188 (diff)
Merge tag 'cgroup-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Last cycle deferred css teardown on cgroup removal until the cgroup
   depopulated, so a css is not taken offline while tasks can still
   reference it. Disabling a controller through cgroup.subtree_control
   still had the same problem. This reworks the deferral from per-cgroup
   to per-css so that path is covered too.

 - New RDMA controller monitoring files: rdma.peak for per-device peak
   usage and rdma.events / rdma.events.local for resource-limit
   exhaustion. The max-limit parser was rewritten, fixing two input
   parsing bugs.

 - cpuset: fix a sched-domain leak on the domain-rebuild failure path
   and skip a redundant hardwall ancestor scan on v2.

 - Misc: pair the remaining lockless cgroup.max.* reads with WRITE_ONCE,
   assorted selftest robustness fixes, and doc path corrections.

* tag 'cgroup-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (22 commits)
  cgroup: Migrate tasks to the root css when a controller is rebound
  docs: cgroup: Fix stale source file paths
  cgroup/cpuset: Free sched domains on rebuild guard failure
  cgroup: pair max limit READ_ONCE() with WRITE_ONCE()
  selftests/cgroup: enable memory controller in hugetlb memcg test
  cgroup/rdma: Drop unnecessary READ_ONCE() on event counters
  cgroup: Defer kill_css_finish() in cgroup_apply_control_disable()
  cgroup: Add per-subsys-css kill_css_finish deferral
  cgroup: Move populated counters to cgroup_subsys_state
  cgroup: Annotate unlocked nr_populated_* accesses with READ_ONCE/WRITE_ONCE
  cgroup: Inline cgroup_has_tasks() in cgroup.h
  cgroup/rdma: document rdma.peak, rdma.events and rdma.events.local
  cgroup/rdma: add rdma.events.local for per-cgroup allocation failure attribution
  cgroup/rdma: add rdma.events to track resource limit exhaustion
  cgroup/rdma: add rdma.peak for per-device peak usage tracking
  selftests/cgroup: check malloc return value in alloc_anon functions
  cgroup/cpuset: Skip hardwall ancestor scan in cpuset v2 in cpuset_current_node_allowed()
  selftests/cgroup: fix misleading debug message in test_cgfreezer_time_child
  selftests/cgroup: fix child process escaping to parent cleanup in test_cpucg_nice
  selftests/cgroup: Add NULL check after malloc in cgroup_util.c
  ...
-rw-r--r-- Documentation/admin-guide/cgroup-v1/cgroups.rst 2
-rw-r--r-- Documentation/admin-guide/cgroup-v1/memcg_test.rst 2
-rw-r--r-- Documentation/admin-guide/cgroup-v2.rst 53
-rw-r--r-- include/linux/cgroup-defs.h 30
-rw-r--r-- include/linux/cgroup.h 27
-rw-r--r-- include/linux/cgroup_rdma.h 4
-rw-r--r-- kernel/cgroup/cgroup.c 222
-rw-r--r-- kernel/cgroup/cpuset-v1.c 2
-rw-r--r-- kernel/cgroup/cpuset.c 10
-rw-r--r-- kernel/cgroup/rdma.c 315
-rw-r--r-- tools/testing/selftests/cgroup/lib/cgroup_util.c 9
-rw-r--r-- tools/testing/selftests/cgroup/test_cpu.c 2
-rwxr-xr-x tools/testing/selftests/cgroup/test_cpuset_prs.sh 2
-rw-r--r-- tools/testing/selftests/cgroup/test_freezer.c 2
-rw-r--r-- tools/testing/selftests/cgroup/test_hugetlb_memcg.c 8
-rw-r--r-- tools/testing/selftests/cgroup/test_memcontrol.c 53
16 files changed, 532 insertions, 211 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
index 463f98453323..e501f45ea93f 100644
--- a/Documentation/admin-guide/cgroup-v1/cgroups.rst
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -525,7 +525,7 @@ cgroup. It may also be taken to prevent cgroups from being
modified, but more specific locks may be more appropriate in that
situation.
-See kernel/cgroup.c for more details.
+See kernel/cgroup/cgroup.c for more details.
Subsystems can take/release the cgroup_mutex via the functions
cgroup_lock()/cgroup_unlock().
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
index 7c7cd457cf69..ebedbc3c3f9c 100644
--- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -321,7 +321,7 @@ Under below explanation, we assume CONFIG_SWAP=y.
----------------------
Memory controller implements memory thresholds using cgroups notification
- API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+ API. You can use samples/cgroup/cgroup_event_listener.c to test it.
(Shell-A) Create cgroup and run event listener::
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6efd0095ed99..993446ab66d0 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2785,6 +2785,59 @@ RDMA Interface Files
mlx4_0 hca_handle=1 hca_object=20
ocrdma1 hca_handle=1 hca_object=23
+ rdma.peak
+ A read-only nested-keyed file that exists for all the cgroups
+ except root. It shows the historical high watermark of
+ resource usage per device since the cgroup was created.
+
+ An example for mlx4 and ocrdma device follows::
+
+ mlx4_0 hca_handle=1 hca_object=20
+ ocrdma1 hca_handle=0 hca_object=23
+
+ rdma.events
+ A read-only nested-keyed file which exists on non-root
+ cgroups. The following nested keys are defined.
+
+ max
+ The number of times a process in this cgroup or its
+ descendants attempted an RDMA resource allocation that
+ was rejected because a rdma.max limit in the subtree
+ was reached. This is a hierarchical counter: the event
+ is propagated upward to all ancestor cgroups. A value
+ change in this file generates a file modified event.
+
+ alloc_fail
+ The number of RDMA resource allocation attempts that
+ originated in this cgroup or its descendants and failed
+ due to a rdma.max limit being reached. This is a
+ hierarchical counter propagated upward.
+
+ An example for mlx4 device follows::
+
+ mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=3 hca_object.max=0 hca_object.alloc_fail=0
+
+ rdma.events.local
+ Similar to rdma.events but the fields in the file are local
+ to the cgroup i.e. not hierarchical. The file modified event
+ generated on this file reflects only the local events.
+
+ The following nested keys are defined.
+
+ max
+ The number of times a process in this cgroup or its
+ descendants attempted an RDMA resource allocation that
+ was rejected because this cgroup's own rdma.max limit
+ was reached.
+ alloc_fail
+ The number of RDMA resource allocation attempts
+ originating from this cgroup that failed due to this
+ cgroup's or an ancestor's rdma.max limit.
+
+ An example for mlx4 device follows::
+
+ mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=0 hca_object.max=0 hca_object.alloc_fail=0
+
DMEM
----
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 50a784da7a81..de2cd6238c2a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -254,6 +254,18 @@ struct cgroup_subsys_state {
int nr_descendants;
/*
+ * Hierarchical populated state. For cgroup->self, nr_populated_csets
+ * counts populated csets linked via cgrp_cset_link.
+ * nr_populated_children counts immediate-child csses whose own
+ * populated state is nonzero. Protected by css_set_lock.
+ */
+ int nr_populated_csets;
+ int nr_populated_children;
+
+ /* deferred kill_css_finish() queued by css_update_populated() */
+ struct work_struct kill_finish_work;
+
+ /*
* A singly-linked list of css structures to be rstat flushed.
* This is a scratch field to be used exclusively by
* css_rstat_flush().
@@ -504,17 +516,12 @@ struct cgroup {
int max_descendants;
/*
- * Each non-empty css_set associated with this cgroup contributes
- * one to nr_populated_csets. The counter is zero iff this cgroup
- * doesn't have any tasks.
- *
- * All children which have non-zero nr_populated_csets and/or
- * nr_populated_children of their own contribute one to either
- * nr_populated_domain_children or nr_populated_threaded_children
- * depending on their type. Each counter is zero iff all cgroups
- * of the type in the subtree proper don't have any tasks.
+ * Domain/threaded split of self.nr_populated_children: each counts
+ * immediate-child cgroups whose subtree is populated and sums to
+ * self.nr_populated_children. Kept as separate fields to allow readers
+ * like cgroup_can_be_thread_root() unlocked access. Protected by
+ * css_set_lock; updated by css_update_populated().
*/
- int nr_populated_csets;
int nr_populated_domain_children;
int nr_populated_threaded_children;
@@ -611,9 +618,6 @@ struct cgroup {
/* used to wait for offlining of csses */
wait_queue_head_t offline_waitq;
- /* defers killing csses after removal until cgroup is depopulated */
- struct work_struct finish_destroy_work;
-
/* used to schedule release agent */
struct work_struct release_agent_work;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c5648fcf74e2..f2aa46a4f871 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -640,11 +640,32 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}
-/* no synchronization, the result can only be used as a hint */
+/*
+ * Populated counters: writes happen under css_set_lock. The accessors below
+ * may read unlocked. What an unpopulated result means depends on context:
+ *
+ * - No lock held. Just a snapshot. May race with concurrent updates and is
+ * useful only as a hint.
+ *
+ * - cgroup_mutex held. Migration into the cgroup is blocked, so an observed
+ * !populated stays !populated until cgroup_mutex is dropped.
+ *
+ * - CSS_DYING set. The css can no longer be repopulated, so !populated is
+ * sticky once observed.
+ */
+static inline bool cgroup_has_tasks(struct cgroup *cgrp)
+{
+ return READ_ONCE(cgrp->self.nr_populated_csets);
+}
+
+static inline bool css_is_populated(struct cgroup_subsys_state *css)
+{
+ return READ_ONCE(css->nr_populated_csets) || READ_ONCE(css->nr_populated_children);
+}
+
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
- return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
- cgrp->nr_populated_threaded_children;
+ return css_is_populated(&cgrp->self);
}
/* returns ino associated with a cgroup */
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
index 80edae03c313..404e746552ca 100644
--- a/include/linux/cgroup_rdma.h
+++ b/include/linux/cgroup_rdma.h
@@ -24,6 +24,10 @@ struct rdma_cgroup {
* that belongs to this cgroup.
*/
struct list_head rpools;
+
+ /* Handles for rdma.events[.local] */
+ struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
};
struct rdmacg_device {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6152add0c5eb..38f8d9df8fbc 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -197,6 +197,14 @@ static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
static u32 cgrp_dfl_threaded_ss_mask;
+/*
+ * Set across rebind_subsystems() to the controllers leaving a hierarchy.
+ * Guarded by cgroup_mutex. Makes find_existing_css_set() resolve them to the
+ * root css so the affected tasks are migrated there before
+ * cgroup_apply_control_disable() kills the per-cgroup csses.
+ */
+static u32 cgroup_rebind_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -264,7 +272,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static void cgroup_finish_destroy(struct cgroup *cgrp);
static void kill_css_sync(struct cgroup_subsys_state *css);
static void kill_css_finish(struct cgroup_subsys_state *css);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
@@ -376,11 +383,6 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static bool cgroup_has_tasks(struct cgroup *cgrp)
-{
- return cgrp->nr_populated_csets;
-}
-
static bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
@@ -409,7 +411,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
return false;
/* can only have either domain or threaded children */
- if (cgrp->nr_populated_domain_children)
+ if (READ_ONCE(cgrp->nr_populated_domain_children))
return false;
/* and no domain controllers can be enabled */
@@ -761,62 +763,76 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - update the populated count of a cgroup
- * @cgrp: the target cgroup
- * @populated: inc or dec populated count
- *
- * One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
- * count is propagated towards root so that a given cgroup's
- * nr_populated_children is zero iff none of its descendants contain any
- * tasks.
- *
- * @cgrp's interface file "cgroup.populated" is zero if both
- * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
- * 1 otherwise. When the sum changes from or to zero, userland is notified
- * that the content of the interface file has changed. This can be used to
- * detect when @cgrp and its descendants become populated or empty.
+ * css_update_populated - update the populated state of a css and ancestors
+ * @css: leaf css whose own populated count is changing
+ * @populated: inc or dec
+ *
+ * One of the css_sets pinned by @css is getting its first task or losing the
+ * last. Propagate the transition up the parent chain so that a css's
+ * nr_populated_children is zero iff none of its descendants contain any tasks.
+ *
+ * For a cgroup->self walk, also runs cgroup-side bookkeeping at each level:
+ * domain/threaded child split, deferred-destroy trigger, and notification via
+ * "cgroup.populated" (zero iff cgrp->self has neither populated csets nor
+ * populated children; userland is notified on transitions).
*/
-static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+static void css_update_populated(struct cgroup_subsys_state *css, bool populated)
{
- struct cgroup *child = NULL;
+ struct cgroup_subsys_state *child = NULL;
int adj = populated ? 1 : -1;
lockdep_assert_held(&css_set_lock);
do {
- bool was_populated = cgroup_is_populated(cgrp);
+ /* non-NULL only on the cgroup->self walk */
+ struct cgroup *cgrp = css_is_self(css) ? css->cgroup : NULL;
+ bool was_populated = css_is_populated(css);
if (!child) {
- cgrp->nr_populated_csets += adj;
+ WRITE_ONCE(css->nr_populated_csets,
+ css->nr_populated_csets + adj);
} else {
- if (cgroup_is_threaded(child))
- cgrp->nr_populated_threaded_children += adj;
- else
- cgrp->nr_populated_domain_children += adj;
+ WRITE_ONCE(css->nr_populated_children,
+ css->nr_populated_children + adj);
+ if (cgrp) {
+ if (cgroup_is_threaded(child->cgroup))
+ WRITE_ONCE(cgrp->nr_populated_threaded_children,
+ cgrp->nr_populated_threaded_children + adj);
+ else
+ WRITE_ONCE(cgrp->nr_populated_domain_children,
+ cgrp->nr_populated_domain_children + adj);
+ }
}
- if (was_populated == cgroup_is_populated(cgrp))
+ if (was_populated == css_is_populated(css))
break;
/*
- * Subtree just emptied below an offlined cgrp. Fire deferred
- * destroy. The transition is one-shot.
+ * Pair with smp_mb() in kill_css_sync(). Either we observe
+ * CSS_DYING and queue, or the caller observes our decrement
+ * and fires synchronously.
+ */
+ smp_mb();
+
+ /*
+ * Subtree just emptied below a dying css. Fire deferred kill.
+ * The transition is one-shot for a dying css.
*/
- if (was_populated && !css_is_online(&cgrp->self)) {
- cgroup_get(cgrp);
- WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
- &cgrp->finish_destroy_work));
+ if (was_populated && css_is_dying(css)) {
+ css_get(css);
+ WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work));
}
- cgroup1_check_for_release(cgrp);
- TRACE_CGROUP_PATH(notify_populated, cgrp,
- cgroup_is_populated(cgrp));
- cgroup_file_notify(&cgrp->events_file);
+ if (cgrp) {
+ cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
+ cgroup_file_notify(&cgrp->events_file);
+ }
- child = cgrp;
- cgrp = cgroup_parent(cgrp);
- } while (cgrp);
+ child = css;
+ css = css->parent;
+ } while (css);
}
/**
@@ -824,17 +840,27 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @cset: target css_set
* @populated: whether @cset is populated or depopulated
*
- * @cset is either getting the first task or losing the last. Update the
- * populated counters of all associated cgroups accordingly.
+ * @cset is either getting the first task or losing the last. Update the
+ * populated counters along each linked cgroup's self chain and each
+ * subsystem css that @cset pins.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ int ssid;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
- cgroup_update_populated(link->cgrp, populated);
+ css_update_populated(&link->cgrp->self, populated);
+
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cset->subsys[ssid];
+
+ if (css)
+ css_update_populated(css, populated);
+ }
}
/*
@@ -1065,7 +1091,15 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
- if (root->subsys_mask & (1UL << i)) {
+ if (unlikely(cgroup_rebind_ss_mask & (1UL << i))) {
+ /*
+ * @ss is leaving this hierarchy and its per-cgroup
+ * csses are about to be killed. Resolve to the
+ * surviving root css so the tasks are migrated there.
+ */
+ template[i] = cgroup_css(&root->cgrp, ss);
+ WARN_ON_ONCE(!template[i]);
+ } else if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
@@ -1835,11 +1869,17 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/*
- * Controllers from default hierarchy that need to be rebound
- * are all disabled together in one go.
+ * Controllers leaving the default hierarchy are disabled
+ * together. cgroup_rebind_ss_mask makes cgroup_apply_control()
+ * migrate their tasks to the root css, so the per-cgroup csses
+ * are unpopulated when cgroup_finalize_control() kills them.
+ * Clear it before cgroup_finalize_control(), which does no
+ * css_set lookup.
*/
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ cgroup_rebind_ss_mask = dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_rebind_ss_mask = 0;
cgroup_finalize_control(scgrp, 0);
}
@@ -1853,9 +1893,14 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
if (src_root != &cgrp_dfl_root) {
- /* disable from the source */
+ /*
+ * Disable from the source, migrating its tasks to the
+ * root css first (see cgroup_rebind_ss_mask).
+ */
src_root->subsys_mask &= ~(1 << ssid);
+ cgroup_rebind_ss_mask = 1 << ssid;
WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_rebind_ss_mask = 0;
cgroup_finalize_control(scgrp, 0);
}
@@ -2051,16 +2096,6 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0;
}
-static void cgroup_finish_destroy_work_fn(struct work_struct *work)
-{
- struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
-
- cgroup_lock();
- cgroup_finish_destroy(cgrp);
- cgroup_unlock();
- cgroup_put(cgrp);
-}
-
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -2087,7 +2122,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2192,7 +2226,7 @@ int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
- cgroup_update_populated(root_cgrp, true);
+ css_update_populated(&root_cgrp->self, true);
}
spin_unlock_irq(&css_set_lock);
@@ -3230,7 +3264,7 @@ restart:
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
DEFINE_WAIT(wait);
- if (!css || !percpu_ref_is_dying(&css->refcnt))
+ if (!css || !css_is_dying(css))
continue;
cgroup_get_live(dsct);
@@ -3398,7 +3432,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css_sync(css);
- kill_css_finish(css);
+ if (!css_is_populated(css))
+ kill_css_finish(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
@@ -3726,7 +3761,7 @@ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
if (!cgrp)
return -ENOENT;
- cgrp->max_descendants = descendants;
+ WRITE_ONCE(cgrp->max_descendants, descendants);
cgroup_kn_unlock(of->kn);
@@ -3769,7 +3804,7 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
if (!cgrp)
return -ENOENT;
- cgrp->max_depth = depth;
+ WRITE_ONCE(cgrp->max_depth, depth);
cgroup_kn_unlock(of->kn);
@@ -5684,6 +5719,22 @@ static void css_release(struct percpu_ref *ref)
queue_work(cgroup_release_wq, &css->destroy_work);
}
+/*
+ * Deferred kill_css_finish() fired from css_update_populated() once a dying
+ * css's hierarchical populated state drops to zero. Pinned by css_get() at the
+ * queue site; matched by css_put() here.
+ */
+static void kill_css_finish_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, kill_finish_work);
+
+ cgroup_lock();
+ kill_css_finish(css);
+ cgroup_unlock();
+ css_put(css);
+}
+
static void init_and_link_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
@@ -5697,6 +5748,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -6074,6 +6126,13 @@ static void kill_css_sync(struct cgroup_subsys_state *css)
css->flags |= CSS_DYING;
/*
+ * Pair with smp_mb() in css_update_populated(). Either our
+ * caller observes the walker's decrement and fires
+ * synchronously, or the walker observes CSS_DYING and queues.
+ */
+ smp_mb();
+
+ /*
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
@@ -6148,9 +6207,9 @@ static void kill_css_finish(struct cgroup_subsys_state *css)
* - This function: synchronous user-visible state teardown plus kill_css_sync()
* on each subsystem css.
*
- * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
- * each subsystem css. Fires once @cgrp's subtree is fully drained, either
- * inline here or from cgroup_update_populated().
+ * - For each subsys css: fire kill_css_finish() synchronously if the subtree is
+ * already drained, otherwise rely on css_update_populated() to queue
+ * kill_finish_work when the last populated cset under the css empties.
*
* - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
* ->css_offline() -> release/free.
@@ -6228,29 +6287,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
- if (!cgroup_is_populated(cgrp))
- cgroup_finish_destroy(cgrp);
+ for_each_css(css, ssid, cgrp) {
+ if (!css_is_populated(css))
+ kill_css_finish(css);
+ }
return 0;
};
-/**
- * cgroup_finish_destroy - deferred half of @cgrp destruction
- * @cgrp: cgroup whose subtree just became empty
- *
- * See cgroup_destroy_locked() for the rationale.
- */
-static void cgroup_finish_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys_state *css;
- int ssid;
-
- lockdep_assert_held(&cgroup_mutex);
-
- for_each_css(css, ssid, cgrp)
- kill_css_finish(css);
-}
-
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 7308e9b02495..3e9968dd91e9 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -312,7 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
* This is full cgroup operation which will also call back into
* cpuset. Execute it asynchronously using workqueue.
*/
- if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ if (is_empty && cgroup_has_tasks(cs->css.cgroup) &&
css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c9e14fda3d6f..591e3aa487fc 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -432,7 +432,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
* nr_populated_domain_children may include populated
* csets from descendants that are partitions.
*/
- if (cs->css.cgroup->nr_populated_csets ||
+ if (cgroup_has_tasks(cs->css.cgroup) ||
cs->attach_in_progress)
return true;
@@ -1004,8 +1004,11 @@ void rebuild_sched_domains_locked(void)
* prevent the panic.
*/
for (i = 0; doms && i < ndoms; i++) {
- if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) {
+ free_sched_domains(doms, ndoms);
+ kfree(attr);
return;
+ }
}
/* Have scheduler rebuild the domains */
@@ -4236,6 +4239,9 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
+ if (cpuset_v2())
+ return true;
+
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 4fdab4cf49e0..5e82a03b3270 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -9,6 +9,7 @@
*/
#include <linux/bitops.h>
+#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
@@ -17,6 +18,22 @@
#define RDMACG_MAX_STR "max"
+enum rdmacg_limit_tokens {
+ RDMACG_HCA_HANDLE_VAL,
+ RDMACG_HCA_HANDLE_MAX,
+ RDMACG_HCA_OBJECT_VAL,
+ RDMACG_HCA_OBJECT_MAX,
+ NR_RDMACG_LIMIT_TOKENS,
+};
+
+static const match_table_t rdmacg_limit_tokens = {
+ { RDMACG_HCA_HANDLE_VAL, "hca_handle=%d" },
+ { RDMACG_HCA_HANDLE_MAX, "hca_handle=max" },
+ { RDMACG_HCA_OBJECT_VAL, "hca_object=%d" },
+ { RDMACG_HCA_OBJECT_MAX, "hca_object=max" },
+ { NR_RDMACG_LIMIT_TOKENS, NULL },
+};
+
/*
* Protects list of resource pools maintained on per cgroup basis
* and rdma device list.
@@ -27,6 +44,7 @@ static LIST_HEAD(rdmacg_devices);
enum rdmacg_file_type {
RDMACG_RESOURCE_TYPE_MAX,
RDMACG_RESOURCE_TYPE_STAT,
+ RDMACG_RESOURCE_TYPE_PEAK,
};
/*
@@ -43,6 +61,7 @@ static char const *rdmacg_resource_names[] = {
struct rdmacg_resource {
int max;
int usage;
+ int peak;
};
/*
@@ -62,6 +81,12 @@ struct rdmacg_resource_pool {
u64 usage_sum;
/* total number counts which are set to max */
int num_max_cnt;
+
+ /* per-resource event counters */
+ u64 events_max[RDMACG_RESOURCE_MAX];
+ u64 events_alloc_fail[RDMACG_RESOURCE_MAX];
+ u64 events_local_max[RDMACG_RESOURCE_MAX];
+ u64 events_local_alloc_fail[RDMACG_RESOURCE_MAX];
};
static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
@@ -109,6 +134,26 @@ static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
kfree(rpool);
}
+static bool rpool_has_persistent_state(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ /*
+ * Keep the rpool alive if any peak value or event counter is
+ * non-zero, so that rdma.peak and the rdma.events[.local]
+ * counters persist even after all resources are freed.
+ */
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ if (rpool->resources[i].peak ||
+ rpool->events_max[i] ||
+ rpool->events_local_max[i] ||
+ rpool->events_alloc_fail[i] ||
+ rpool->events_local_alloc_fail[i])
+ return true;
+ }
+ return false;
+}
+
static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
struct rdmacg_device *device)
@@ -187,11 +232,67 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
rpool->usage_sum--;
if (rpool->usage_sum == 0 &&
rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
- /*
- * No user of the rpool and all entries are set to max, so
- * safe to delete this rpool.
- */
- free_cg_rpool_locked(rpool);
+ if (!rpool_has_persistent_state(rpool)) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+ }
+}
+
+/**
+ * rdmacg_event_locked - fire event when resource allocation exceeds limit
+ * @cg: requesting cgroup
+ * @over_cg: cgroup whose limit was exceeded
+ * @device: rdma device
+ * @index: resource type index
+ *
+ * Must be called under rdmacg_mutex. Updates event counters in the
+ * resource pools of @cg and @over_cg, propagates hierarchical max
+ * events from @over_cg (including itself) upward, and notifies
+ * userspace via cgroup_file_notify().
+ */
+static void rdmacg_event_locked(struct rdma_cgroup *cg,
+ struct rdma_cgroup *over_cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *p;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ /* Increment local alloc_fail in requesting cgroup */
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool) {
+ rpool->events_local_alloc_fail[index]++;
+ cgroup_file_notify(&cg->events_local_file);
+ }
+
+ /* Increment local max in the over-limit cgroup */
+ rpool = find_cg_rpool_locked(over_cg, device);
+ if (rpool) {
+ rpool->events_local_max[index]++;
+ cgroup_file_notify(&over_cg->events_local_file);
+ }
+
+ /* Propagate hierarchical max events upward */
+ for (p = over_cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (!IS_ERR(rpool)) {
+ rpool->events_max[index]++;
+ cgroup_file_notify(&p->events_file);
+ }
+ }
+ /* Propagate hierarchical alloc_fail from requesting cgroup upward */
+ for (p = cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (!IS_ERR(rpool)) {
+ rpool->events_alloc_fail[index]++;
+ cgroup_file_notify(&p->events_file);
+ }
}
}
@@ -293,12 +394,20 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
}
}
}
+ /* Update peak only after all charges succeed */
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = find_cg_rpool_locked(p, device);
+ if (rpool && rpool->resources[index].usage > rpool->resources[index].peak)
+ rpool->resources[index].peak = rpool->resources[index].usage;
+ }
mutex_unlock(&rdmacg_mutex);
*rdmacg = cg;
return 0;
err:
+ if (ret == -EAGAIN)
+