diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 09:18:14 +0100 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 09:18:14 +0100 |
| commit | 9c87e61e3c5797277407ba5eae4eac8a52be3fa3 (patch) | |
| tree | e3f902cb5363b5b90ab74a4b7e26fafbc15aaeaf /kernel/trace | |
| parent | b85966adbf5de0668a815c6e3527f87e0c387fb4 (diff) | |
| parent | e4287bf34f97a88c7d9322f5bde828724c073a6b (diff) | |
Merge tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
"Major changes:
- Recover from BPF arena page faults using a scratch page and add
ptep_try_set() for lockless empty-slot installs on x86 and arm64.
This allows BPF kfuncs to access arena pointers directly.
The 'arena_direct_access' stable branch was created for this work
and was pulled into sched-ext and bpf-next trees (Tejun Heo, Kumar
Kartikeya Dwivedi)
- Lift old restriction and support 6+ arguments in BPF programs and
kfuncs on x86 and arm64 (Yonghong Song, Puranjay Mohan)
Other features and fixes:
- Add 24-bit BTF vlen and reclaim unused bits in the BTF UAPI to ease
addition of new BTF kinds (Alan Maguire)
- Raise the maximum BPF call chain depth from 8 to 16 frames (Alexei
Starovoitov)
- Refactor object relationship tracking in the verifier and fix a
dynptr use-after-free bug (Amery Hung)
- Harden the signed program loader and reject exclusive maps as inner
maps (Daniel Borkmann)
- Replace the verifier min/max bounds fields with a circular number
(cnum) representation and improve 32->64 bit range refinements
(Eduard Zingerman)
- Introduce the arena library and runtime (libarena) with a buddy
allocator, rbtree and SPMC queue data structures, ASAN support and
a parallel test harness. Allow subprograms to return arena pointers
and switch to a BTF type-tag based __arena annotation (Emil
Tsalapatis)
- Cache build IDs in the sleepable stackmap path and avoid faultable
build ID reads under mm locks (Ihor Solodrai)
- Introduce the tracing_multi link to attach a single BPF program to
many kernel functions at once. Allow specifying the uprobe_multi
target via FD (Jiri Olsa)
- Extend the bpf_list family of kfuncs with bpf_list_add/del(), and
bpf_list_is_first/is_last/empty() (Kaitao Cheng)
- Extend the BPF syscall with common attributes support for
prog_load, btf_load and map_create (Leon Hwang)
- Wrap rhashtable as BPF map (Mykyta Yatsenko, Herbert Xu)
- Add sleepable support for tracepoint programs and fix deadlocks in
LRU map due to NMI reentry (Mykyta Yatsenko)
- Fix OOB access in bpf_flow_keys, fix nullness analysis of inner
arrays, enforce write checks for global subprograms (Nuoqi Gui)
- Report the maximum combined stack depth and print a breakdown of
instructions processed per subprogram (Paul Chaignon)
- Add an XDP load-balancer benchmark and arm64 JIT support for stack
arguments (Puranjay Mohan)
- Add kfuncs to traverse over wakeup_sources (Samuel Wu)
- Allow sleepable BPF programs to use LPM trie maps directly (Vlad
Poenaru)
- Many more fixes and cleanups across the verifier, BTF, sockmap,
devmap, bpffs, security hooks, s390/riscv/loongarch JITs,
rqspinlock, libbpf, bpftool, selftests"
* tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (336 commits)
selftests/bpf: Work around llvm stack overflow in crypto progs
selftests/bpf: add test for bpf_msg_pop_data() overflow
bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check
sockmap: Fix use-after-free in udp_bpf_recvmsg()
bpf, sockmap: keep sk_msg copy state in sync
bpf, sockmap: Fix wrong rsge offset in bpf_msg_push_data()
bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data()
selftsets/bpf: Retry map update on helper_fill_hashmap()
selftests/bpf: Add test for sleepable lsm_cgroup rejection
selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper
bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
selftests/bpf: Avoid static LLVM linking for cross builds
selftests/bpf: Use common CFLAGS for urandom_read
selftests/bpf: Initialize operation name before use
tools/bpf: build: Append extra cflags
libbpf: Initialize CFLAGS before including Makefile.include
bpftool: Append extra host flags
bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS
bpftool: Pass host flags to bootstrap libbpf
selftests/bpf: correct CONFIG_PPC64 macro name in comment
...
Diffstat (limited to 'kernel/trace')
| -rw-r--r-- | kernel/trace/bpf_trace.c | 334 | ||||
| -rw-r--r-- | kernel/trace/ftrace.c | 35 | ||||
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 110 |
3 files changed, 393 insertions, 86 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a02bd258677e..82f8feea6931 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> +#include <linux/file.h> #include <net/bpf_sk_storage.h> @@ -42,6 +43,7 @@ #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) +#define MAX_TRACING_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +/** + * trace_call_bpf_faultable - invoke BPF program in faultable context + * @call: tracepoint event + * @ctx: opaque context pointer + * + * Variant of trace_call_bpf() for faultable tracepoints (syscall + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace + * for lifetime protection and bpf_prog_run_array_sleepable() for per-program + * RCU flavor selection, following the uprobe pattern. + * + * Per-program recursion protection is provided by + * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not + * needed because syscall tracepoints cannot self-recurse. + * + * Must be called from a faultable/preemptible context. + */ +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + struct bpf_prog_array *prog_array; + + might_fault(); + guard(rcu_tasks_trace)(); + + prog_array = rcu_dereference_check(call->prog_array, + rcu_read_lock_trace_held()); + return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run); +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { @@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog) static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_FSESSION; + (prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI); } static const struct bpf_func_proto * @@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct srcu_ctr __percpu *scp = NULL; struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - rcu_read_lock_dont_migrate(); + if (sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock_dont_migrate(); + } + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - (void) bpf_prog_run(prog, args); + (void)bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); - rcu_read_unlock_migrate(); + + if (sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock_migrate(); + } } #define UNPACK(...) __VA_ARGS__ @@ -3170,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) return run_ctx->uprobe->cookie; } +static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path) +{ + void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + u32 path_fd = attr->link_create.uprobe_multi.path_fd; + u32 flags = attr->link_create.uprobe_multi.flags; + + if (flags & BPF_F_UPROBE_MULTI_PATH_FD) { + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is + * identified by path_fd, upath must be NULL. + */ + if (upath) + return -EINVAL; + + CLASS(fd, f)(path_fd); + if (fd_empty(f)) + return -EBADF; + *path = fd_file(f)->f_path; + path_get(path); + return 0; + } + + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved + * relative to the cwd (AT_FDCWD) or absolute using the upath string. + */ + if (!upath || path_fd) + return -EINVAL; + + return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path); +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3179,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; - void __user *upath; + unsigned long size; u32 flags, cnt, i; struct path path; - char *name; pid_t pid; int err; @@ -3197,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD)) return -EINVAL; /* - * path, offsets and cnt are mandatory, + * offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ - upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt || pid < 0) + if (!uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3217,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); - name = strndup_user(upath, PATH_MAX); - if (IS_ERR(name)) { - err = PTR_ERR(name); - return err; - } + /* + * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value + * size, we need to check their address range is safe for __get_user + * calls. + */ + size = sizeof(*uoffsets) * cnt; + if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) || + !access_ok(ucookies, size)) + return -EFAULT; - err = kern_path(name, LOOKUP_FOLLOW, &path); - kfree(name); + err = bpf_uprobe_multi_get_path(attr, &path); if (err) return err; @@ -3398,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, +static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; @@ -3439,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; @@ -3540,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3590,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3599,3 +3677,203 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 } __bpf_kfunc_end_defs(); + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) + +static void bpf_tracing_multi_link_release(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link)); +} + +static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + kvfree(tr_link->fexits); + kvfree(tr_link->cookies); + kvfree(tr_link); +} + +#ifdef CONFIG_PROC_FS +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + bool has_cookies = !!tr_link->cookies; + + seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type); + seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt); + + seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func"); + for (int i = 0; i < tr_link->nodes_cnt; i++) { + struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i]; + u32 btf_id, obj_id; + + bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id); + seq_printf(seq, "%u\t %u\t %llu\t %pS\n", + obj_id, btf_id, + has_cookies ? tr_link->cookies[i] : 0, + (void *) mnode->trampoline->ip); + + cond_resched(); + } +} +#endif + +static const struct bpf_link_ops bpf_tracing_multi_link_lops = { + .release = bpf_tracing_multi_link_release, + .dealloc_deferred = bpf_tracing_multi_link_dealloc, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_tracing_multi_show_fdinfo, +#endif +}; + +static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) +{ + u32 a = *(u32 *) pa; + u32 b = *(u32 *) pb; + + return (a > b) - (a < b); +} + +static void ids_swap_r(void *a, void *b, int size __maybe_unused, + const void *priv __maybe_unused) +{ + u64 *cookie_a, *cookie_b, *cookies; + u32 *id_a = a, *id_b = b, *ids; + void **data = (void **) priv; + + ids = data[0]; + cookies = data[1]; + + if (cookies) { + cookie_a = cookies + (id_a - ids); + cookie_b = cookies + (id_b - ids); + swap(*cookie_a, *cookie_b); + } + swap(*id_a, *id_b); +} + +static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) +{ + void *data[2] = { ids, cookies }; + int err = 0; + + /* + * Sort ids array (together with cookies array if defined) + * and check it for duplicates. The ids and cookies arrays + * are left sorted. + */ + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data); + + for (int i = 1; i < cnt; i++) { + if (ids[i] == ids[i - 1]) { + err = -EINVAL; + break; + } + } + return err; +} + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + struct bpf_tracing_multi_link *link = NULL; + struct bpf_tramp_node *fexits = NULL; + struct bpf_link_primer link_primer; + u32 cnt, *ids = NULL; + u64 __user *ucookies; + u64 *cookies = NULL; + u32 __user *uids; + int err; + + uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids); + cnt = attr->link_create.tracing_multi.cnt; + + if (!cnt || !uids) + return -EINVAL; + if (cnt > MAX_TRACING_MULTI_CNT) + return -E2BIG; + if (attr->link_create.flags || attr->link_create.target_fd) + return -EINVAL; + + ids = kvmalloc_objs(*ids, cnt); + if (!ids) + return -ENOMEM; + + if (copy_from_user(ids, uids, cnt * sizeof(*ids))) { + err = -EFAULT; + goto error; + } + + ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies); + if (ucookies) { + cookies = kvmalloc_objs(*cookies, cnt); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) { + err = -EFAULT; + goto error; + } + } + + err = check_dup_ids(ids, cookies, cnt); + if (err) + goto error; + + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + fexits = kvmalloc_objs(*fexits, cnt); + if (!fexits) { + err = -ENOMEM; + goto error; + } + } + + link = kvzalloc_flex(*link, nodes, cnt); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI, + &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + link->nodes_cnt = cnt; + link->cookies = cookies; + link->fexits = fexits; + + err = bpf_trampoline_multi_attach(prog, ids, link); + kvfree(ids); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + return bpf_link_settle(&link_primer); + +error: + kvfree(fexits); + kvfree(cookies); + kvfree(ids); + kvfree(link); + return err; +} + +#else + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2611de3f594..f93e34dd2328 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return __ftrace_lookup_ip(hash, ip); } -static void __add_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; @@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne entry->ip = ip; entry->direct = direct; - __add_hash_entry(hash, entry); + add_ftrace_hash_entry(hash, entry); return entry; } @@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +void ftrace_hash_remove(struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *tn; + int size; + int i; + + if (!hash || !hash->count) + return; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; @@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size) hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { remove_hash_entry(src, entry); - __add_hash_entry(new_hash, entry); + add_ftrace_hash_entry(new_hash, entry); } } return new_hash; @@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, map->entry.ip = ip; map->data = data; - __add_hash_entry(&mapper->hash, &map->entry); + add_ftrace_hash_entry(&mapper->hash, &map->entry); return 0; } @@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -static unsigned long hash_count(struct ftrace_hash *hash) +static inline unsigned long hash_count(struct ftrace_hash *hash) { return hash ? hash->count : 0; } +unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return hash_count(hash); +} + /** * hash_add - adds two struct ftrace_hash and returns the result * @a: struct ftrace_hash object diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)¶m = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(®s); + *(struct pt_regs **)¶m = ®s; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, ¶m); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. + */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)¶m = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, ¶m); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(®s); + *(struct pt_regs **)¶m = ®s; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. + */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } |
