aboutsummaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-06-17 09:18:14 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2026-06-17 09:18:14 +0100
commit9c87e61e3c5797277407ba5eae4eac8a52be3fa3 (patch)
treee3f902cb5363b5b90ab74a4b7e26fafbc15aaeaf /kernel/trace
parentb85966adbf5de0668a815c6e3527f87e0c387fb4 (diff)
parente4287bf34f97a88c7d9322f5bde828724c073a6b (diff)
Merge tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov: "Major changes: - Recover from BPF arena page faults using a scratch page and add ptep_try_set() for lockless empty-slot installs on x86 and arm64. This allows BPF kfuncs to access arena pointers directly. The 'arena_direct_access' stable branch was created for this work and was pulled into sched-ext and bpf-next trees (Tejun Heo, Kumar Kartikeya Dwivedi) - Lift old restriction and support 6+ arguments in BPF programs and kfuncs on x86 and arm64 (Yonghong Song, Puranjay Mohan) Other features and fixes: - Add 24-bit BTF vlen and reclaim unused bits in the BTF UAPI to ease addition of new BTF kinds (Alan Maguire) - Raise the maximum BPF call chain depth from 8 to 16 frames (Alexei Starovoitov) - Refactor object relationship tracking in the verifier and fix a dynptr use-after-free bug (Amery Hung) - Harden the signed program loader and reject exclusive maps as inner maps (Daniel Borkmann) - Replace the verifier min/max bounds fields with a circular number (cnum) representation and improve 32->64 bit range refinements (Eduard Zingerman) - Introduce the arena library and runtime (libarena) with a buddy allocator, rbtree and SPMC queue data structures, ASAN support and a parallel test harness. Allow subprograms to return arena pointers and switch to a BTF type-tag based __arena annotation (Emil Tsalapatis) - Cache build IDs in the sleepable stackmap path and avoid faultable build ID reads under mm locks (Ihor Solodrai) - Introduce the tracing_multi link to attach a single BPF program to many kernel functions at once. Allow specifying the uprobe_multi target via FD (Jiri Olsa) - Extend the bpf_list family of kfuncs with bpf_list_add/del(), and bpf_list_is_first/is_last/empty() (Kaitao Cheng) - Extend the BPF syscall with common attributes support for prog_load, btf_load and map_create (Leon Hwang) - Wrap rhashtable as BPF map (Mykyta Yatsenko, Herbert Xu) - Add sleepable support for tracepoint programs and fix deadlocks in LRU map due to NMI reentry (Mykyta Yatsenko) - Fix OOB access in bpf_flow_keys, fix nullness analysis of inner arrays, enforce write checks for global subprograms (Nuoqi Gui) - Report the maximum combined stack depth and print a breakdown of instructions processed per subprogram (Paul Chaignon) - Add an XDP load-balancer benchmark and arm64 JIT support for stack arguments (Puranjay Mohan) - Add kfuncs to traverse over wakeup_sources (Samuel Wu) - Allow sleepable BPF programs to use LPM trie maps directly (Vlad Poenaru) - Many more fixes and cleanups across the verifier, BTF, sockmap, devmap, bpffs, security hooks, s390/riscv/loongarch JITs, rqspinlock, libbpf, bpftool, selftests" * tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (336 commits) selftests/bpf: Work around llvm stack overflow in crypto progs selftests/bpf: add test for bpf_msg_pop_data() overflow bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check sockmap: Fix use-after-free in udp_bpf_recvmsg() bpf, sockmap: keep sk_msg copy state in sync bpf, sockmap: Fix wrong rsge offset in bpf_msg_push_data() bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data() selftsets/bpf: Retry map update on helper_fill_hashmap() selftests/bpf: Add test for sleepable lsm_cgroup rejection selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket selftests/bpf: Avoid static LLVM linking for cross builds selftests/bpf: Use common CFLAGS for urandom_read selftests/bpf: Initialize operation name before use tools/bpf: build: Append extra cflags libbpf: Initialize CFLAGS before including Makefile.include bpftool: Append extra host flags bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS bpftool: Pass host flags to bootstrap libbpf selftests/bpf: correct CONFIG_PPC64 macro name in comment ...
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/bpf_trace.c334
-rw-r--r--kernel/trace/ftrace.c35
-rw-r--r--kernel/trace/trace_syscalls.c110
3 files changed, 393 insertions, 86 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a02bd258677e..82f8feea6931 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/namei.h>
+#include <linux/file.h>
#include <net/bpf_sk_storage.h>
@@ -42,6 +43,7 @@
#define MAX_UPROBE_MULTI_CNT (1U << 20)
#define MAX_KPROBE_MULTI_CNT (1U << 20)
+#define MAX_TRACING_MULTI_CNT (1U << 20)
#ifdef CONFIG_MODULES
struct bpf_trace_module {
@@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
return ret;
}
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and bpf_prog_run_array_sleepable() for per-program
+ * RCU flavor selection, following the uprobe pattern.
+ *
+ * Per-program recursion protection is provided by
+ * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not
+ * needed because syscall tracepoints cannot self-recurse.
+ *
+ * Must be called from a faultable/preemptible context.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+ struct bpf_prog_array *prog_array;
+
+ might_fault();
+ guard(rcu_tasks_trace)();
+
+ prog_array = rcu_dereference_check(call->prog_array,
+ rcu_read_lock_trace_held());
+ return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run);
+}
+
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
@@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog)
static inline bool is_trace_fsession(const struct bpf_prog *prog)
{
return prog->type == BPF_PROG_TYPE_TRACING &&
- prog->expected_attach_type == BPF_TRACE_FSESSION;
+ (prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI);
}
static const struct bpf_func_proto *
@@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- rcu_read_lock_dont_migrate();
+ if (sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock_dont_migrate();
+ }
+
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
@@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- (void) bpf_prog_run(prog, args);
+ (void)bpf_prog_run(prog, args);
bpf_reset_run_ctx(old_run_ctx);
out:
bpf_prog_put_recursion_context(prog);
- rcu_read_unlock_migrate();
+
+ if (sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock_migrate();
+ }
}
#define UNPACK(...) __VA_ARGS__
@@ -3170,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
return run_ctx->uprobe->cookie;
}
+static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path)
+{
+ void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ u32 path_fd = attr->link_create.uprobe_multi.path_fd;
+ u32 flags = attr->link_create.uprobe_multi.flags;
+
+ if (flags & BPF_F_UPROBE_MULTI_PATH_FD) {
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is
+ * identified by path_fd, upath must be NULL.
+ */
+ if (upath)
+ return -EINVAL;
+
+ CLASS(fd, f)(path_fd);
+ if (fd_empty(f))
+ return -EBADF;
+ *path = fd_file(f)->f_path;
+ path_get(path);
+ return 0;
+ }
+
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved
+ * relative to the cwd (AT_FDCWD) or absolute using the upath string.
+ */
+ if (!upath || path_fd)
+ return -EINVAL;
+
+ return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path);
+}
+
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_uprobe_multi_link *link = NULL;
@@ -3179,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
struct task_struct *task = NULL;
unsigned long __user *uoffsets;
u64 __user *ucookies;
- void __user *upath;
+ unsigned long size;
u32 flags, cnt, i;
struct path path;
- char *name;
pid_t pid;
int err;
@@ -3197,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
- if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD))
return -EINVAL;
/*
- * path, offsets and cnt are mandatory,
+ * offsets and cnt are mandatory,
* ref_ctr_offsets and cookies are optional
*/
- upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
cnt = attr->link_create.uprobe_multi.cnt;
pid = attr->link_create.uprobe_multi.pid;
- if (!upath || !uoffsets || !cnt || pid < 0)
+ if (!uoffsets || !cnt || pid < 0)
return -EINVAL;
if (cnt > MAX_UPROBE_MULTI_CNT)
return -E2BIG;
@@ -3217,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
- name = strndup_user(upath, PATH_MAX);
- if (IS_ERR(name)) {
- err = PTR_ERR(name);
- return err;
- }
+ /*
+ * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value
+ * size, we need to check their address range is safe for __get_user
+ * calls.
+ */
+ size = sizeof(*uoffsets) * cnt;
+ if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) ||
+ !access_ok(ucookies, size))
+ return -EFAULT;
- err = kern_path(name, LOOKUP_FOLLOW, &path);
- kfree(name);
+ err = bpf_uprobe_multi_get_path(attr, &path);
if (err)
return err;
@@ -3398,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
+static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
@@ -3439,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64
u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
u64 off, chunk_sz;
@@ -3540,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3590,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
copy_user_data_sleepable, tsk);
}
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3599,3 +3677,203 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
}
__bpf_kfunc_end_defs();
+
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+ defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS)
+
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+
+ WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link));
+}
+
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+
+ kvfree(tr_link->fexits);
+ kvfree(tr_link->cookies);
+ kvfree(tr_link);
+}
+
+#ifdef CONFIG_PROC_FS
+static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+ bool has_cookies = !!tr_link->cookies;
+
+ seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type);
+ seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt);
+
+ seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func");
+ for (int i = 0; i < tr_link->nodes_cnt; i++) {
+ struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i];
+ u32 btf_id, obj_id;
+
+ bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id);
+ seq_printf(seq, "%u\t %u\t %llu\t %pS\n",
+ obj_id, btf_id,
+ has_cookies ? tr_link->cookies[i] : 0,
+ (void *) mnode->trampoline->ip);
+
+ cond_resched();
+ }
+}
+#endif
+
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+ .release = bpf_tracing_multi_link_release,
+ .dealloc_deferred = bpf_tracing_multi_link_dealloc,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_tracing_multi_show_fdinfo,
+#endif
+};
+
+static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused)
+{
+ u32 a = *(u32 *) pa;
+ u32 b = *(u32 *) pb;
+
+ return (a > b) - (a < b);
+}
+
+static void ids_swap_r(void *a, void *b, int size __maybe_unused,
+ const void *priv __maybe_unused)
+{
+ u64 *cookie_a, *cookie_b, *cookies;
+ u32 *id_a = a, *id_b = b, *ids;
+ void **data = (void **) priv;
+
+ ids = data[0];
+ cookies = data[1];
+
+ if (cookies) {
+ cookie_a = cookies + (id_a - ids);
+ cookie_b = cookies + (id_b - ids);
+ swap(*cookie_a, *cookie_b);
+ }
+ swap(*id_a, *id_b);
+}
+
+static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt)
+{
+ void *data[2] = { ids, cookies };
+ int err = 0;
+
+ /*
+ * Sort ids array (together with cookies array if defined)
+ * and check it for duplicates. The ids and cookies arrays
+ * are left sorted.
+ */
+ sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data);
+
+ for (int i = 1; i < cnt; i++) {
+ if (ids[i] == ids[i - 1]) {
+ err = -EINVAL;
+ break;
+ }
+ }
+ return err;
+}
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ struct bpf_tracing_multi_link *link = NULL;
+ struct bpf_tramp_node *fexits = NULL;
+ struct bpf_link_primer link_primer;
+ u32 cnt, *ids = NULL;
+ u64 __user *ucookies;
+ u64 *cookies = NULL;
+ u32 __user *uids;
+ int err;
+
+ uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids);
+ cnt = attr->link_create.tracing_multi.cnt;
+
+ if (!cnt || !uids)
+ return -EINVAL;
+ if (cnt > MAX_TRACING_MULTI_CNT)
+ return -E2BIG;
+ if (attr->link_create.flags || attr->link_create.target_fd)
+ return -EINVAL;
+
+ ids = kvmalloc_objs(*ids, cnt);
+ if (!ids)
+ return -ENOMEM;
+
+ if (copy_from_user(ids, uids, cnt * sizeof(*ids))) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_objs(*cookies, cnt);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ err = check_dup_ids(ids, cookies, cnt);
+ if (err)
+ goto error;
+
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+ fexits = kvmalloc_objs(*fexits, cnt);
+ if (!fexits) {
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ link = kvzalloc_flex(*link, nodes, cnt);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI,
+ &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ link->nodes_cnt = cnt;
+ link->cookies = cookies;
+ link->fexits = fexits;
+
+ err = bpf_trampoline_multi_attach(prog, ids, link);
+ kvfree(ids);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ return bpf_link_settle(&link_primer);
+
+error:
+ kvfree(fexits);
+ kvfree(cookies);
+ kvfree(ids);
+ kvfree(link);
+ return err;
+}
+
+#else
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2611de3f594..f93e34dd2328 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return __ftrace_lookup_ip(hash, ip);
}
-static void __add_hash_entry(struct ftrace_hash *hash,
- struct ftrace_func_entry *entry)
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
@@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne
entry->ip = ip;
entry->direct = direct;
- __add_hash_entry(hash, entry);
+ add_ftrace_hash_entry(hash, entry);
return entry;
}
@@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash,
hash->count--;
}
+void ftrace_hash_remove(struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_head *hhd;
+ struct hlist_node *tn;
+ int size;
+ int i;
+
+ if (!hash || !hash->count)
+ return;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hhd = &hash->buckets[i];
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
+ remove_hash_entry(hash, entry);
+ }
+ FTRACE_WARN_ON(hash->count);
+}
+
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
@@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
hhd = &src->buckets[i];
hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
remove_hash_entry(src, entry);
- __add_hash_entry(new_hash, entry);
+ add_ftrace_hash_entry(new_hash, entry);
}
}
return new_hash;
@@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
map->entry.ip = ip;
map->data = data;
- __add_hash_entry(&mapper->hash, &map->entry);
+ add_ftrace_hash_entry(&mapper->hash, &map->entry);
return 0;
}
@@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-static unsigned long hash_count(struct ftrace_hash *hash)
+static inline unsigned long hash_count(struct ftrace_hash *hash)
{
return hash ? hash->count : 0;
}
+unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+ return hash_count(hash);
+}
+
/**
* hash_add - adds two struct ftrace_hash and returns the result
* @a: struct ftrace_hash object
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8ad72e17d8eb..e98ee7e1e66f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec)
+ int syscall_nr, unsigned long *args)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
+ struct pt_regs regs = {};
int i;
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
for (i = 0; i < sys_data->nb_args; i++)
- param.args[i] = rec->args[i];
- return trace_call_bpf(call, &param);
+ param.args[i] = args[i];
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size = 0;
int uargs = 0;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, args);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+ syscall_nr, args))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
@@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
}
- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
- if (!valid_prog_array && hlist_empty(head))
- return;
-
/* get the size after alignment with the u32 buffer size field */
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
@@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (mayfault)
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
- if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
@@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call)
syscall_fault_buffer_disable();
}
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
- struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+ int syscall_nr, long ret_val)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long ret;
} __aligned(8) param;
-
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
- param.ret = rec->ret;
- return trace_call_bpf(call, &param);
+ struct pt_regs regs = {};
+
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
+ param.ret = ret_val;
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
- if (!valid_prog_array && hlist_empty(head))
+ if (valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+ syscall_get_return_value(current, regs)))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
return;
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}