diff options
| author | Christian Brauner <brauner@kernel.org> | 2026-05-21 08:46:22 +0200 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2026-05-26 11:02:02 +0200 |
| commit | 4425cd76b5e73ce92bea9dc61a0027ef3d55c9f0 (patch) | |
| tree | 4a0cefcb5798c8b96c2481271e3edd2ea9d08558 /include/linux | |
| parent | 5200f5f493f79f14bbdc349e402a40dfb32f23c8 (diff) | |
| parent | 6b1c66c9cca99bf00386481c7b2aa7394c26d8b8 (diff) | |
Merge patch series "exec: introduce task_exec_state for exec-time metadata"
Christian Brauner (Amutable) <brauner@kernel.org> says:
This series relocates the dumpable mode and the user_namespace
captured at execve() from mm_struct onto a new per-task
task_exec_state structure that stays attached to the task for its
full lifetime.
__ptrace_may_access() and several /proc owner / visibility checks
need to consult two pieces of state for any observable task,
including zombies that have already gone through exit_mm(): the
dumpable mode and the user namespace captured at execve(). Both
live on mm_struct today, which exit_mm() clears from the task long
before the task is reaped.
A reader that races with do_exit() observes task->mm == NULL and
either fails the check or falls back to init_user_ns - which denies
legitimate access to non-dumpable zombies that were running in a
nested user namespace.
mm_struct loses ->user_ns and the dumpability bits in ->flags.
MMF_DUMPABLE_BITS is reserved so MMF_DUMP_FILTER_* layout exposed via
/proc/<pid>/coredump_filter stays stable. task->user_dumpable and its
exit_mm() snapshot are removed.
task_exec_state is the privilege domain established by an execve()
[1]. Within a thread group it is shared via refcount; across thread
groups each task has its own:
- CLONE_VM siblings (thread-group members, io_uring workers)
refcount-share the parent's exec_state.
- Non-CLONE_VM clones (fork(), vfork() without CLONE_VM)
allocate a fresh exec_state inheriting the parent's dumpable
mode and user_ns.
- execve() in the child allocates a fresh instance and installs
it under task_lock + exec_update_lock via
task_exec_state_replace().
- Credential changes (setresuid, capset, ...) and
prctl(PR_SET_DUMPABLE) update dumpability on the current
task's exec_state, i.e. on the thread group's shared instance.
Behavioral change:
Kernel threads that briefly use a user mm via kthread_use_mm() no
longer inherit dumpability from the borrowed mm. Kthreads are not
ptraceable (PF_KTHREAD short-circuits __ptrace_may_access), so this
is observable only via /proc surfaces that a sufficiently privileged
reader can reach.
[1] https://lore.kernel.org/r/CAHk-=wj+NgoDH3GSicJ140SV8OoDd71pLmL3fgFEsTcgoMC6Og@mail.gmail.com
* patches from https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org:
exec_state: relocate dumpable information
ptrace: add ptracer_access_allowed()
exec: introduce struct task_exec_state
sched/coredump: introduce enum task_dumpable
Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/binfmts.h | 2 | ||||
| -rw-r--r-- | include/linux/coredump.h | 4 | ||||
| -rw-r--r-- | include/linux/mm_types.h | 9 | ||||
| -rw-r--r-- | include/linux/ptrace.h | 1 | ||||
| -rw-r--r-- | include/linux/sched.h | 6 | ||||
| -rw-r--r-- | include/linux/sched/coredump.h | 47 | ||||
| -rw-r--r-- | include/linux/sched/exec_state.h | 31 |
7 files changed, 56 insertions, 44 deletions
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 65abd5ab8836..a8379f4eee61 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,8 @@ struct linux_binprm { struct page *page[MAX_ARG_PAGES]; #endif struct mm_struct *mm; + /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */ + struct user_namespace *user_ns; unsigned long p; /* current top of mem */ unsigned int /* Should an execfd be passed to userspace? */ diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 68861da4cf7c..7b38ee2e7913 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/sched/coredump.h> #include <asm/siginfo.h> #ifdef CONFIG_COREDUMP @@ -20,7 +21,10 @@ struct coredump_params { const kernel_siginfo_t *siginfo; struct file *file; unsigned long limit; + /* MMF_DUMP_FILTER_* bits, snapshot of mm->flags at dump start. */ unsigned long mm_flags; + /* Snapshot of dumpable at dump start. */ + enum task_dumpable dumpable; int cpu; loff_t written; loff_t pos; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..9588ce3b16df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1342,7 +1342,6 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif - struct user_namespace *user_ns; /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; @@ -1907,11 +1906,11 @@ enum { /* mm flags */ /* - * The first two bits represent core dump modes for set-user-ID, - * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + * Bits 0 and 1 were dumpability; that moved to task->exec_state. Reserve + * the bits so MMF_DUMP_FILTER_* positions stay stable for the + * /proc/<pid>/coredump_filter ABI. */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1972,7 +1971,7 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 90507d4afcd6..ef314f7a9ecc 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -17,6 +17,7 @@ struct syscall_info { struct seccomp_data data; }; +bool ptracer_access_allowed(struct task_struct *tsk); extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index ee06cba5c6f5..258cb075478d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -85,6 +85,7 @@ struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; +struct task_exec_state; struct task_group; struct task_struct; struct timespec64; @@ -962,6 +963,8 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; + struct task_exec_state __rcu *exec_state; + int exit_state; int exit_code; int exit_signal; @@ -1002,9 +1005,6 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif - /* Save user-dumpable when mm goes away */ - unsigned user_dumpable:1; - /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 624fda17a785..20957ccde3b5 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -2,43 +2,18 @@ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H -#include <linux/mm_types.h> - -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -#define SUID_DUMP_USER 1 /* Dump as user of process */ -#define SUID_DUMP_ROOT 2 /* Dump as root */ - -static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) -{ - /* - * By convention, dumpable bits are contained in first 32 bits of the - * bitmap, so we can simply access this first unsigned long directly. - */ - return __mm_flags_get_word(mm); -} - -static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) -{ - __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); -} - -extern void set_dumpable(struct mm_struct *mm, int value); /* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. + * Task dumpability mode. Gates core dump production and ptrace_attach() + * authorization. The numeric values are stable ABI (suid_dumpable + * sysctl, prctl(PR_SET_DUMPABLE)); do not renumber. */ -static inline int __get_dumpable(unsigned long mm_flags) -{ - return mm_flags & MMF_DUMPABLE_MASK; -} - -static inline int get_dumpable(struct mm_struct *mm) -{ - unsigned long flags = __mm_flags_get_dumpable(mm); - - return __get_dumpable(flags); -} +enum task_dumpable { + TASK_DUMPABLE_OFF = 0, /* no dump; ptrace needs CAP_SYS_PTRACE */ + TASK_DUMPABLE_OWNER = 1, /* default; dump and ptrace by uid match */ + TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */ +}; + +void task_exec_state_set_dumpable(enum task_dumpable value); +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h new file mode 100644 index 000000000000..9b61782510b8 --- /dev/null +++ b/include/linux/sched/exec_state.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#ifndef _LINUX_SCHED_EXEC_STATE_H +#define _LINUX_SCHED_EXEC_STATE_H + +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched/coredump.h> +#include <linux/user_namespace.h> + +struct task_exec_state { + refcount_t count; + enum task_dumpable dumpable; + struct user_namespace *user_ns; + struct rcu_head rcu; +}; + +extern struct task_exec_state init_task_exec_state; + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns); +void put_task_exec_state(struct task_exec_state *exec_state); +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk); +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state); +int task_exec_state_copy(struct task_struct *tsk); +void __init exec_state_init(void); + +DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T)) + +#endif /* _LINUX_SCHED_EXEC_STATE_H */ |
