aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2023-09-16 09:34:23 -0700
committerAlexei Starovoitov <ast@kernel.org>2023-09-16 09:36:44 -0700
commitec6f1b4db95b7eedb3fe85f4f14e08fa0e9281c3 (patch)
tree2094312e53ff2b27e6aa2cb71fd29cc5b41175c4
parentc4ab64e6da42030c866f0589d1bfc13a037432dd (diff)
parentd2a93715bfb0655a63bb1687f43f48eb2e61717b (diff)
Merge branch 'exceptions-1-2'
Kumar Kartikeya Dwivedi says: ==================== Exceptions - 1/2 This series implements the _first_ part of the runtime and verifier support needed to enable BPF exceptions. Exceptions thrown from programs are processed as an immediate exit from the program, which unwinds all the active stack frames until the main stack frame, and returns to the BPF program's caller. The ability to perform this unwinding safely allows the program to test conditions that are always true at runtime but which the verifier has no visibility into. Thus, it also reduces verification effort by safely terminating redundant paths that can be taken within a program. The patches to perform runtime resource cleanup during the frame-by-frame unwinding will be posted as a follow-up to this set. It must be noted that exceptions are not an error handling mechanism for unlikely runtime conditions, but a way to safely terminate the execution of a program in presence of conditions that should never occur at runtime. They are meant to serve higher-level primitives such as program assertions. The following kfuncs and macros are introduced: Assertion macros are also introduced, please see patch 13 for their documentation. /* Description * Throw a BPF exception from the program, immediately terminating its * execution and unwinding the stack. The supplied 'cookie' parameter * will be the return value of the program when an exception is thrown, * and the default exception callback is used. Otherwise, if an exception * callback is set using the '__exception_cb(callback)' declaration tag * on the main program, the 'cookie' parameter will be the callback's only * input argument. * * Thus, in case of default exception callback, 'cookie' is subjected to * constraints on the program's return value (as with R0 on exit). * Otherwise, the return value of the marked exception callback will be * subjected to the same checks. * * Note that throwing an exception with lingering resources (locks, * references, etc.) will lead to a verification error. * * Note that callbacks *cannot* call this helper. * Returns * Never. * Throws * An exception with the specified 'cookie' value. */ extern void bpf_throw(u64 cookie) __ksym; /* This macro must be used to mark the exception callback corresponding to the * main program. For example: * * int exception_cb(u64 cookie) { * return cookie; * } * * SEC("tc") * __exception_cb(exception_cb) * int main_prog(struct __sk_buff *ctx) { * ... * return TC_ACT_OK; * } * * Here, exception callback for the main program will be 'exception_cb'. Note * that this attribute can only be used once, and multiple exception callbacks * specified for the main program will lead to verification error. */ \#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name))) As such, a program can only install an exception handler once for the lifetime of a BPF program, and this handler cannot be changed at runtime. The purpose of the handler is to simply interpret the cookie value supplied by the bpf_throw call, and execute user-defined logic corresponding to it. The primary purpose of allowing a handler is to control the return value of the program. The default handler returns the cookie value passed to bpf_throw when an exception is thrown. Fixing the handler for the lifetime of the program eliminates tricky and expensive handling in case of runtime changes of the handler callback when programs begin to nest, where it becomes more complex to save and restore the active handler at runtime. This version of offline unwinding based BPF exceptions is truly zero overhead, with the exception of generation of a default callback which contains a few instructions to return a default return value (0) when no exception callback is supplied by the user. Callbacks are disallowed from throwing BPF exceptions for now, since such exceptions need to cross the callback helper boundary (and therefore must care about unwinding kernel state), however it is possible to lift this restriction in the future follow-up. Exceptions terminate propogating at program boundaries, hence both BPF_PROG_TYPE_EXT and tail call targets return to their caller context the return value of the exception callback, in the event that they throw an exception. Thus, exceptions do not cross extension or tail call boundary. However, this is mostly an implementation choice, and can be changed to suit more user-friendly semantics. Changelog: ---------- v2 -> v3 v2: https://lore.kernel.org/bpf/20230809114116.3216687-1-memxor@gmail.com * Add Dave's Acked-by. * Address all comments from Alexei. * Use bpf_is_subprog to check for main prog in bpf_stack_walker. * Drop accidental leftover hunk in libbpf patch. * Split libbpf patch's refactoring to aid review * Disable fentry/fexit in addition to freplace for exception cb. * Add selftests for fentry/fexit/freplace on exception cb and main prog. * Use btf_find_by_name_kind in bpf_find_exception_callback_insn_off (Martin) * Split KASAN patch into two to aid backporting (Andrey) * Move exception callback append step to bpf_object__reloacte (Andrii) * Ensure that the exception callback name is unique (Andrii) * Keep ASM implementation of assertion macros instead of C, as it does not achieve intended results for bpf_assert_range and other cases. v1 -> v2 v1: https://lore.kernel.org/bpf/20230713023232.1411523-1-memxor@gmail.com * Address all comments from Alexei. * Fix a few bugs and corner cases in the implementations found during testing. Also add new selftests for these cases. * Reinstate patch to consider ksym.end part of the program (but reworked to cover other corner cases). * Implement new style of tagging exception callbacks, add libbpf support for the new declaration tag. * Limit support to 64-bit integer types for assertion macros. The compiler ends up performing shifts or bitwise and operations when finally making use of the value, which defeats the purpose of the macro. On noalu32 mode, the shifts may also happen before use, hurting reliability. * Comprehensively test assertion macros and their side effects on the verifier state, register bounds, etc. * Fix a KASAN false positive warning. RFC v1 -> v1 RFC v1: https://lore.kernel.org/bpf/20230405004239.1375399-1-memxor@gmail.com * Completely rework the unwinding infrastructure to use offline unwinding support. * Remove the runtime exception state and program rewriting code. * Make bpf_set_exception_callback idempotent to avoid vexing synchronization and state clobbering issues in presence of program nesting. * Disable bpf_throw within callback functions, for now. * Allow bpf_throw in tail call programs and extension programs, removing limitations of rewrite based unwinding. * Expand selftests. ==================== Link: https://lore.kernel.org/r/20230912233214.1518551-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r--arch/arm64/net/bpf_jit_comp.c2
-rw-r--r--arch/s390/net/bpf_jit_comp.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c117
-rw-r--r--include/linux/bpf.h13
-rw-r--r--include/linux/bpf_verifier.h8
-rw-r--r--include/linux/filter.h8
-rw-r--r--include/linux/kasan.h2
-rw-r--r--kernel/bpf/btf.c29
-rw-r--r--kernel/bpf/core.c29
-rw-r--r--kernel/bpf/helpers.c45
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/bpf/verifier.c451
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--tools/lib/bpf/libbpf.c166
-rw-r--r--tools/testing/selftests/bpf/DENYLIST.aarch641
-rw-r--r--tools/testing/selftests/bpf/DENYLIST.s390x1
-rw-r--r--tools/testing/selftests/bpf/bpf_experimental.h288
-rw-r--r--tools/testing/selftests/bpf/prog_tests/exceptions.c408
-rw-r--r--tools/testing/selftests/bpf/progs/exceptions.c368
-rw-r--r--tools/testing/selftests/bpf/progs/exceptions_assert.c135
-rw-r--r--tools/testing/selftests/bpf/progs/exceptions_ext.c72
-rw-r--r--tools/testing/selftests/bpf/progs/exceptions_fail.c347
22 files changed, 2371 insertions, 124 deletions
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 150d1c6543f7..7d4af64e3982 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -288,7 +288,7 @@ static bool is_lsi_offset(int offset, int scale)
static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
{
const struct bpf_prog *prog = ctx->prog;
- const bool is_main_prog = prog->aux->func_idx == 0;
+ const bool is_main_prog = !bpf_is_subprog(prog);
const u8 r6 = bpf2a64[BPF_REG_6];
const u8 r7 = bpf2a64[BPF_REG_7];
const u8 r8 = bpf2a64[BPF_REG_8];
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index de2fb12120d2..eeb42e5cd7d6 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -556,7 +556,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
jit->prologue_plt_ret = jit->prg;
- if (fp->aux->func_idx == 0) {
+ if (!bpf_is_subprog(fp)) {
/* Initialize the tail call counter in the main program. */
/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2846c21d75bf..84005f2114e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -16,6 +16,9 @@
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
+#include <asm/unwind.h>
+
+static bool all_callee_regs_used[4] = {true, true, true, true};
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -255,6 +258,14 @@ struct jit_context {
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
+static void push_r12(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x54); /* push r12 */
+ *pprog = prog;
+}
+
static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
@@ -270,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
*pprog = prog;
}
+static void pop_r12(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x5C); /* pop r12 */
+ *pprog = prog;
+}
+
static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
@@ -291,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
* while jumping to another program
*/
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
- bool tail_call_reachable, bool is_subprog)
+ bool tail_call_reachable, bool is_subprog,
+ bool is_exception_cb)
{
u8 *prog = *pprog;
@@ -311,8 +331,22 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
/* Keep the same instruction layout. */
EMIT2(0x66, 0x90); /* nop2 */
}
- EMIT1(0x55); /* push rbp */
- EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ /* Exception callback receives FP as third parameter */
+ if (is_exception_cb) {
+ EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
+ EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
+ /* The main frame must have exception_boundary as true, so we
+ * first restore those callee-saved regs from stack, before
+ * reusing the stack frame.
+ */
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ /* Reset the stack frame. */
+ EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
+ } else {
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ }
/* X86_TAIL_CALL_OFFSET is here */
EMIT_ENDBR();
@@ -471,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
-static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
+static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+ u8 **pprog, bool *callee_regs_used,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
@@ -521,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JE, offset); /* je out */
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
EMIT1(0x58); /* pop rax */
if (stack_depth)
@@ -545,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
*pprog = prog;
}
-static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
+ struct bpf_jit_poke_descriptor *poke,
u8 **pprog, u8 *ip,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
@@ -574,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
poke->tailcall_bypass);
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
+
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1049,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_prologue(&prog, bpf_prog->aux->stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
- bpf_prog->aux->func_idx != 0);
- push_callee_regs(&prog, callee_regs_used);
+ bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+ /* Exception callback will clobber callee regs for its own use, and
+ * restore the original callee regs from main prog's stack frame.
+ */
+ if (bpf_prog->aux->exception_boundary) {
+ /* We also need to save r12, which is not mapped to any BPF
+ * register, as we throw after entry into the kernel, which may
+ * overwrite r12.
+ */
+ push_r12(&prog);
+ push_callee_regs(&prog, all_callee_regs_used);
+ } else {
+ push_callee_regs(&prog, callee_regs_used);
+ }
ilen = prog - temp;
if (rw_image)
@@ -1647,13 +1706,15 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
- emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+ emit_bpf_tail_call_direct(bpf_prog,
+ &bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
bpf_prog->aux->stack_depth,
ctx);
else
- emit_bpf_tail_call_indirect(&prog,
+ emit_bpf_tail_call_indirect(bpf_prog,
+ &prog,
callee_regs_used,
bpf_prog->aux->stack_depth,
image + addrs[i - 1],
@@ -1906,7 +1967,12 @@ emit_jmp:
seen_exit = true;
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
EMIT1(0xC9); /* leave */
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
break;
@@ -2933,3 +2999,30 @@ void bpf_jit_free(struct bpf_prog *prog)
bpf_prog_unlock_free(prog);
}
+
+bool bpf_jit_supports_exceptions(void)
+{
+ /* We unwind through both kernel frames (starting from within bpf_throw
+ * call) and BPF frames. Therefore we require one of ORC or FP unwinder
+ * to be enabled to walk kernel frames and reach BPF frames in the stack
+ * trace.
+ */
+ return IS_ENABLED(CONFIG_UNWINDER_ORC) || IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER);
+}
+
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+ struct unwind_state state;
+ unsigned long addr;
+
+ for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp))
+ break;
+ }
+ return;
+#endif
+ WARN(1, "verification of programs using bpf_throw should have failed\n");
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9e573159432..30063a760b5a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1389,6 +1389,7 @@ struct bpf_prog_aux {
u32 stack_depth;
u32 id;
u32 func_cnt; /* used by non-func prog as the number of func progs */
+ u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
u32 attach_btf_id; /* in-kernel BTF type id to attach to */
u32 ctx_arg_info_size;
@@ -1409,6 +1410,8 @@ struct bpf_prog_aux {
bool sleepable;
bool tail_call_reachable;
bool xdp_has_frags;
+ bool exception_cb;
+ bool exception_boundary;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
@@ -1431,6 +1434,7 @@ struct bpf_prog_aux {
int cgroup_atype; /* enum cgroup_bpf_attach_type */
struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
char name[BPF_OBJ_NAME_LEN];
+ unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
#ifdef CONFIG_SECURITY
void *security;
#endif
@@ -2418,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *regs);
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg);
+ struct bpf_reg_state *reg, bool is_ex_cb);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf, const struct btf_type *t);
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key);
struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);
@@ -3194,4 +3200,9 @@ static inline gfp_t bpf_memcg_flags(gfp_t flags)
return flags;
}
+static inline bool bpf_is_subprog(const struct bpf_prog *prog)
+{
+ return prog->aux->func_idx != 0;
+}
+
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..94ec766432f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -300,6 +300,7 @@ struct bpf_func_state {
bool in_callback_fn;
struct tnum callback_ret_range;
bool in_async_callback_fn;
+ bool in_exception_callback_fn;
/* The following fields should be last. See copy_func_state() */
int acquired_refs;
@@ -541,7 +542,9 @@ struct bpf_subprog_info {
bool has_tail_call;
bool tail_call_reachable;
bool has_ld_abs;
+ bool is_cb;
bool is_async_cb;
+ bool is_exception_cb;
};
struct bpf_verifier_env;
@@ -588,6 +591,8 @@ struct bpf_verifier_env {
u32 used_map_cnt; /* number of used maps */
u32 used_btf_cnt; /* number of used BTF objects */
u32 id_gen; /* used to generate unique reg IDs */
+ u32 hidden_subprog_cnt; /* number of hidden subprogs */
+ int exception_callback_subprog;
bool explore_alu_limits;
bool allow_ptr_leaks;
bool allow_uninit_stack;
@@ -595,10 +600,11 @@ struct bpf_verifier_env {
bool bypass_spec_v1;
bool bypass_spec_v4;
bool seen_direct_write;
+ bool seen_exception;
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
const struct bpf_line_info *prev_linfo;
struct bpf_verifier_log log;
- struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+ struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
union {
struct bpf_idmap idmap_scratch;
struct bpf_idset idset_scratch;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0138832ad571..27406aee2d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
+bool bpf_jit_supports_exceptions(void);
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);
static inline bool bpf_dump_raw_ok(const struct cred *cred)
@@ -1169,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym);
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
@@ -1236,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
return -ERANGE;
}
+static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+ return NULL;
+}
+
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..7a463f814db2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -283,8 +283,10 @@ static inline bool kasan_check_byte(const void *address)
#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
void kasan_unpoison_task_stack(struct task_struct *task);
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
#else
static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+static inline void kasan_unpoison_task_stack_below(const void *watermark) {}
#endif
#ifdef CONFIG_KASAN_GENERIC
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 187b57276fec..f93e835d90af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3310,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND;
}
-static const char *btf_find_decl_tag_value(const struct btf *btf,
- const struct btf_type *pt,
- int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
{
+ const char *value = NULL;
int i;
for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3327,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
continue;
if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
continue;
- return __btf_name_by_offset(btf, t->name_off) + len;
+ /* Prevent duplicate entries for same type */
+ if (value)
+ return ERR_PTR(-EEXIST);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
}
- return NULL;
+ if (!value)
+ return ERR_PTR(-ENOENT);
+ return value;
}
static int
@@ -3347,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
if (t->size != sz)
return BTF_FIELD_IGNORE;
value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
- if (!value_type)
+ if (IS_ERR(value_type))
return -EINVAL;
node_field_name = strstr(value_type, ":");
if (!node_field_name)
@@ -6954,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+ struct bpf_reg_state *regs, bool is_ex_cb)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
@@ -7011,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int */
+ /* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -7060,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
i, btf_type_str(t), tname);
return -EINVAL;
}
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
+ bpf_log(log, "exception cb only supports single integer argument\n");
+ return -EINVAL;
+ }
return 0;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 95599df82ee4..8f921b6d6981 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const struct bpf_line_info *linfo;
void **jited_linfo;
- if (!prog->aux->jited_linfo)
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
/* Userspace did not provide linfo */
return;
@@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
- for (i = 0; i < fp->aux->func_cnt; i++)
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
bpf_prog_kallsyms_del(fp->aux->func[i]);
}
@@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
/* prog->aux->name will be ignored if full btf name is available */
- if (prog->aux->func_info_cnt) {
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
type = btf_type_by_id(prog->aux->btf,
prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -623,7 +623,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
if (val < ksym->start)
return -1;
- if (val >= ksym->end)
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
return 1;
return 0;
@@ -733,7 +737,7 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
struct bpf_ksym *ksym = bpf_ksym_find(addr);
@@ -1208,7 +1212,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
if (!extra_pass)
addr = NULL;
else if (prog->aux->func &&
- off >= 0 && off < prog->aux->func_cnt)
+ off >= 0 && off < prog->aux->real_func_cnt)
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
@@ -2721,7 +2725,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++) {
+ for (i = 0; i < aux->real_func_cnt; i++) {
/* We can just unlink the subprog poke descriptor table as
* it was originally linked to the main program and is also
* released along with it.
@@ -2729,7 +2733,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
}
- if (aux->func_cnt) {
+ if (aux->real_func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
@@ -2914,6 +2918,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0a9834f1051..7ff2a42f1996 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
#include "../../lib/kstrtox.h"
@@ -2449,6 +2450,49 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ if (!is_bpf_text_address(ip))
+ return !ctx->cnt;
+ prog = bpf_prog_ksym_find(ip);
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2478,6 +2522,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
BTF_SET8_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6a692f3bea15..85c1d908f70f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2749,7 +2749,7 @@ free_used_maps:
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
- __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
free_prog_sec:
free_uid(prog->aux->user);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 18e673c0ac15..a7178ecf676d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
@@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
@@ -2454,6 +2457,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exc