From 12ae2c81b21cfaa193db2faf035d495807edc3a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:59 +0100 Subject: clone: add CLONE_AUTOREAP Add a new clone3() flag CLONE_AUTOREAP that makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern where the parent simply doesn't care about the child's exit status. No exit signal is delivered so exit_signal must be zero. CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild would inherit exit_signal == 0 from the autoreap parent's group leader but without signal->autoreap. This grandchild would become a zombie that never sends a signal and is never autoreaped - confusing and arguably broken behavior. The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. Link: https://github.com/uapi-group/kernel-features/issues/45 Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..69f7b4f9eb0c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,9 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 -- cgit v1.2.3 From 24baca56fafc33d4fb77cd9858a48c734183cb22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:00 +0100 Subject: clone: add CLONE_NNP Add a new clone3() flag CLONE_NNP that sets no_new_privs on the child process at clone time. This is analogous to prctl(PR_SET_NO_NEW_PRIVS) but applied at process creation rather than requiring a separate step after the child starts running. CLONE_NNP is rejected with CLONE_THREAD. It's conceptually a lot simpler if the whole thread-group is forced into NNP and not have single threads running around with NNP. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-2-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 69f7b4f9eb0c..386c8d7e89cb 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -37,6 +37,7 @@ #define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ +#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 -- cgit v1.2.3 From c8134b5f13ae959de2b3c8cc278e2602b0857345 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:01 +0100 Subject: pidfd: add CLONE_PIDFD_AUTOKILL Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process not a thread. The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file so they extend the child's lifetime until the last reference drops. CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges so unprivileged usage is allowed. This is a deliberate departure from the pdeath_signal model which is reset during secureexec and commit_creds() rendering it useless for container runtimes that need to deprivilege themselves. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 1 + include/uapi/linux/sched.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index ea9a6811fc76..9281956a9f32 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -13,6 +13,7 @@ #ifdef __KERNEL__ #include #define PIDFD_STALE CLONE_PIDFD +#define PIDFD_AUTOKILL O_TRUNC #endif /* Flags for pidfd_send_signal(). */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 386c8d7e89cb..149dbc64923b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -38,6 +38,7 @@ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ #define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ +#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 -- cgit v1.2.3 From 701f7f4fbabbf4989ba6fbf033b160dd943221d5 Mon Sep 17 00:00:00 2001 From: Emanuele Rocca Date: Mon, 23 Mar 2026 14:02:16 +0100 Subject: pidfds: add coredump_code field to pidfd_info The struct pidfd_info currently exposes in a field called coredump_signal the signal number (si_signo) that triggered the dump (for example, 11 for SIGSEGV). However, it is also valuable to understand the reason why that signal was sent. This additional context is provided by the signal code (si_code), such as 2 for SEGV_ACCERR. Add a new field to struct pidfd_info called coredump_code with the value of si_code for the benefit of sysadmins who pipe core dumps to user-space programs for later analysis. The following snippet illustrates a simplified C program that consumes coredump_signal and coredump_code, and then logs core dump signals and codes to a file: int pidfd = (int)atoi(argv[1]); struct pidfd_info info = { .mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP, }; if (ioctl(pidfd, PIDFD_GET_INFO, &info) == 0) if (info.mask & PIDFD_INFO_COREDUMP) fprintf(f, "PID=%d, si_signo: %d si_code: %d\n", info.pid, info.coredump_signal, info.coredump_code); Assuming the program is installed under /usr/local/bin/core-logger, core dump processing can be enabled by setting /proc/sys/kernel/core_pattern to '|/usr/local/bin/dumpstuff %F'. systemd-coredump(8) already uses pidfds to process core dumps, and it could be extended to include the values of coredump_code too. Signed-off-by: Emanuele Rocca Link: https://patch.msgid.link/acE52HIFivNZN3nE@NH27D9T0LF Acked-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 9281956a9f32..0919246a1611 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -29,10 +29,12 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ #define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */ #define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */ +#define PIDFD_INFO_COREDUMP_CODE (1UL << 7) /* Always returned if PIDFD_INFO_COREDUMP is requested. */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ #define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ #define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */ +#define PIDFD_INFO_SIZE_VER3 88 /* sizeof fourth published struct */ /* * Values for @coredump_mask in pidfd_info. @@ -99,6 +101,8 @@ struct pidfd_info { struct /* coredump info */ { __u32 coredump_mask; __u32 coredump_signal; + __u32 coredump_code; + __u32 coredump_pad; /* align supported_mask to 8 bytes */ }; __u64 supported_mask; /* Mask flags that this kernel supports */ }; -- cgit v1.2.3