aboutsummaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2026-05-21 08:46:22 +0200
committerChristian Brauner <brauner@kernel.org>2026-05-26 11:02:02 +0200
commit4425cd76b5e73ce92bea9dc61a0027ef3d55c9f0 (patch)
tree4a0cefcb5798c8b96c2481271e3edd2ea9d08558 /include/linux
parent5200f5f493f79f14bbdc349e402a40dfb32f23c8 (diff)
parent6b1c66c9cca99bf00386481c7b2aa7394c26d8b8 (diff)
Merge patch series "exec: introduce task_exec_state for exec-time metadata"
Christian Brauner (Amutable) <brauner@kernel.org> says: This series relocates the dumpable mode and the user_namespace captured at execve() from mm_struct onto a new per-task task_exec_state structure that stays attached to the task for its full lifetime. __ptrace_may_access() and several /proc owner / visibility checks need to consult two pieces of state for any observable task, including zombies that have already gone through exit_mm(): the dumpable mode and the user namespace captured at execve(). Both live on mm_struct today, which exit_mm() clears from the task long before the task is reaped. A reader that races with do_exit() observes task->mm == NULL and either fails the check or falls back to init_user_ns - which denies legitimate access to non-dumpable zombies that were running in a nested user namespace. mm_struct loses ->user_ns and the dumpability bits in ->flags. MMF_DUMPABLE_BITS is reserved so MMF_DUMP_FILTER_* layout exposed via /proc/<pid>/coredump_filter stays stable. task->user_dumpable and its exit_mm() snapshot are removed. task_exec_state is the privilege domain established by an execve() [1]. Within a thread group it is shared via refcount; across thread groups each task has its own: - CLONE_VM siblings (thread-group members, io_uring workers) refcount-share the parent's exec_state. - Non-CLONE_VM clones (fork(), vfork() without CLONE_VM) allocate a fresh exec_state inheriting the parent's dumpable mode and user_ns. - execve() in the child allocates a fresh instance and installs it under task_lock + exec_update_lock via task_exec_state_replace(). - Credential changes (setresuid, capset, ...) and prctl(PR_SET_DUMPABLE) update dumpability on the current task's exec_state, i.e. on the thread group's shared instance. Behavioral change: Kernel threads that briefly use a user mm via kthread_use_mm() no longer inherit dumpability from the borrowed mm. 
Kthreads are not ptraceable (PF_KTHREAD short-circuits __ptrace_may_access), so this is observable only via /proc surfaces that a sufficiently privileged reader can reach. [1] https://lore.kernel.org/r/CAHk-=wj+NgoDH3GSicJ140SV8OoDd71pLmL3fgFEsTcgoMC6Og@mail.gmail.com * patches from https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org: exec_state: relocate dumpable information ptrace: add ptracer_access_allowed() exec: introduce struct task_exec_state sched/coredump: introduce enum task_dumpable Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/binfmts.h2
-rw-r--r--include/linux/coredump.h4
-rw-r--r--include/linux/mm_types.h9
-rw-r--r--include/linux/ptrace.h1
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/sched/coredump.h47
-rw-r--r--include/linux/sched/exec_state.h31
7 files changed, 56 insertions, 44 deletions
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 65abd5ab8836..a8379f4eee61 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -25,6 +25,8 @@ struct linux_binprm {
struct page *page[MAX_ARG_PAGES];
#endif
struct mm_struct *mm;
+ /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */
+ struct user_namespace *user_ns;
unsigned long p; /* current top of mem */
unsigned int
/* Should an execfd be passed to userspace? */
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 68861da4cf7c..7b38ee2e7913 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/sched/coredump.h>
#include <asm/siginfo.h>
#ifdef CONFIG_COREDUMP
@@ -20,7 +21,10 @@ struct coredump_params {
const kernel_siginfo_t *siginfo;
struct file *file;
unsigned long limit;
+ /* MMF_DUMP_FILTER_* bits, snapshot of mm->flags at dump start. */
unsigned long mm_flags;
+ /* Snapshot of dumpable at dump start. */
+ enum task_dumpable dumpable;
int cpu;
loff_t written;
loff_t pos;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..9588ce3b16df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1342,7 +1342,6 @@ struct mm_struct {
*/
struct task_struct __rcu *owner;
#endif
- struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
@@ -1907,11 +1906,11 @@ enum {
/* mm flags */
/*
- * The first two bits represent core dump modes for set-user-ID,
- * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
+ * Bits 0 and 1 were dumpability; that moved to task->exec_state. Reserve
+ * the bits so MMF_DUMP_FILTER_* positions stay stable for the
+ * /proc/<pid>/coredump_filter ABI.
*/
#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -1972,7 +1971,7 @@ enum {
#define MMF_TOPDOWN 31 /* mm searches top down by default */
#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
-#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+#define MMF_INIT_LEGACY_MASK (MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 90507d4afcd6..ef314f7a9ecc 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -17,6 +17,7 @@ struct syscall_info {
struct seccomp_data data;
};
+bool ptracer_access_allowed(struct task_struct *tsk);
extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee06cba5c6f5..258cb075478d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -85,6 +85,7 @@ struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
+struct task_exec_state;
struct task_group;
struct task_struct;
struct timespec64;
@@ -962,6 +963,8 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
+ struct task_exec_state __rcu *exec_state;
+
int exit_state;
int exit_code;
int exit_signal;
@@ -1002,9 +1005,6 @@ struct task_struct {
unsigned sched_rt_mutex:1;
#endif
- /* Save user-dumpable when mm goes away */
- unsigned user_dumpable:1;
-
/* Bit to tell TOMOYO we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 624fda17a785..20957ccde3b5 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -2,43 +2,18 @@
#ifndef _LINUX_SCHED_COREDUMP_H
#define _LINUX_SCHED_COREDUMP_H
-#include <linux/mm_types.h>
-
-#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
-#define SUID_DUMP_USER 1 /* Dump as user of process */
-#define SUID_DUMP_ROOT 2 /* Dump as root */
-
-static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
-{
- /*
- * By convention, dumpable bits are contained in first 32 bits of the
- * bitmap, so we can simply access this first unsigned long directly.
- */
- return __mm_flags_get_word(mm);
-}
-
-static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
-{
- __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
-}
-
-extern void set_dumpable(struct mm_struct *mm, int value);
/*
- * This returns the actual value of the suid_dumpable flag. For things
- * that are using this for checking for privilege transitions, it must
- * test against SUID_DUMP_USER rather than treating it as a boolean
- * value.
+ * Task dumpability mode. Gates core dump production and ptrace_attach()
+ * authorization. The numeric values are stable ABI (suid_dumpable
+ * sysctl, prctl(PR_SET_DUMPABLE)); do not renumber.
*/
-static inline int __get_dumpable(unsigned long mm_flags)
-{
- return mm_flags & MMF_DUMPABLE_MASK;
-}
-
-static inline int get_dumpable(struct mm_struct *mm)
-{
- unsigned long flags = __mm_flags_get_dumpable(mm);
-
- return __get_dumpable(flags);
-}
+enum task_dumpable {
+ TASK_DUMPABLE_OFF = 0, /* no dump; ptrace needs CAP_SYS_PTRACE */
+ TASK_DUMPABLE_OWNER = 1, /* default; dump and ptrace by uid match */
+ TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */
+};
+
+void task_exec_state_set_dumpable(enum task_dumpable value);
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task);
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h
new file mode 100644
index 000000000000..9b61782510b8
--- /dev/null
+++ b/include/linux/sched/exec_state.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_SCHED_EXEC_STATE_H
+#define _LINUX_SCHED_EXEC_STATE_H
+
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/sched/coredump.h>
+#include <linux/user_namespace.h>
+
+struct task_exec_state {
+ refcount_t count; /* reference count; CLONE_VM siblings share one instance */
+ enum task_dumpable dumpable; /* dumpable mode, see enum task_dumpable */
+ struct user_namespace *user_ns; /* user namespace captured at execve() */
+ struct rcu_head rcu; /* NOTE(review): presumably for RCU-deferred freeing - confirm */
+};
+
+extern struct task_exec_state init_task_exec_state;
+
+struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns);
+void put_task_exec_state(struct task_exec_state *exec_state);
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk);
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+ struct task_exec_state *exec_state);
+int task_exec_state_copy(struct task_struct *tsk);
+void __init exec_state_init(void);
+
+DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T))
+
+#endif /* _LINUX_SCHED_EXEC_STATE_H */