aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author: Alexei Starovoitov <ast@kernel.org> 2026-05-25 08:35:07 -0700
committer: Alexei Starovoitov <ast@kernel.org> 2026-05-25 08:35:48 -0700
commit: 8496d9020ff37a33c2a7b2fc84350fd03ffbde78 (patch)
tree: ba0c676e678f26cff0b1c497423d933925ae2b76 /include
parent: eb19eead368bb0f0ef06a4125d03ed661cd23d36 (diff)
parent: 53cc12a2dc88c2c6f62f507548640885a70a56a8 (diff)
Merge branch 'arena_direct_access'
Tejun Heo says: ==================== This makes BPF arena memory directly dereferenceable from kernel code (struct_ops callbacks, kfuncs). Each arena gets a per-arena scratch page that an arch fault hook installs into empty PTEs on kernel-side faults, after KFENCE. The faulting instruction retries and the violation is reported through the program's BPF stream. v4: - Patch 1: note that the strict-zero cmpxchg is narrower than pte_none() in inline comments on both x86 and arm64. (Andrea) - Patch 2: stub bpf_arena_handle_page_fault() for !CONFIG_BPF_SYSCALL via a new include/linux/bpf_defs.h. (lkp) - Patch 7: scx_arena_alloc() retries via a loop instead of a single retry on pool growth. (Andrea) - Picked up Reviewed-by tags from Emil and Andrea. v3: https://lore.kernel.org/r/20260520235052.4180316-1-tj@kernel.org v2: https://lore.kernel.org/r/20260517211232.1670594-1-tj@kernel.org v1 (RFC): https://lore.kernel.org/r/20260427105109.2554518-1-tj@kernel.org Motivation ---------- sched_ext's ops_cid.set_cmask() hands the BPF scheduler a struct scx_cmask *. The kernel translates a kernel cpumask to a cmask, but it had no way to write into the arena, so the cmask lived in kernel memory and was passed as a trusted pointer. BPF cmask helpers all operate on arena cmasks though, so the BPF side had to word-by-word probe-read the kernel cmask into an arena cmask via cmask_copy_from_kernel() before any helper could touch it. It works, but is clumsy. The shape isn't unique to set_cmask. Sub-scheduler support is on the way and more sched_ext callbacks will want to pass structured data to BPF. Anywhere a kfunc or struct_ops callback wants to hand a struct to a BPF program, arena residence is the natural answer. Approach -------- Each arena gets a per-arena scratch page. Arenas stay sparsely mapped as today - PTEs are populated only for allocated pages. A new arch fault hook (bpf_arena_handle_page_fault) is wired into x86 page_fault_oops() and arm64 __do_kernel_fault(), after KFENCE. 
When a kernel-side access faults inside an arena's kern_vm range, the helper walks the stack to find the BPF program responsible, range-checks the fault address against prog->aux->arena, and atomically installs the scratch page into the empty PTE via the new ptep_try_set() wrapper. The kernel instruction retries and reads/writes the scratch page. Free paths and map destruction treat scratch as non-owned. Real allocation refuses to overwrite scratch (apply_range_set_cb returns -EBUSY). A scratched address stays dead until map destroy, since its presence means the BPF program has already malfunctioned. The mechanism is default behavior - no UAPI flag. What this preserves ------------------- All the debugging properties of today's sparse-PTE design are preserved: * BPF programs still fault on unmapped arena accesses. The fault semantics (instruction retry with rdst = 0) and the violation report through bpf_streams are unchanged for prog-side accesses. * The first kernel-side touch of an unmapped address is reported via bpf_streams the same way as a prog-side fault, with the stack walk attributing it to the originating prog. * User-side fault on a never-scratched address still lazy-allocates a real page (or returns SIGSEGV under BPF_F_SEGV_ON_FAULT). User-side fault on a scratched address SIGSEGVs. What changes for the kernel-side caller is just that an unmapped deref no longer oopses - it retries through the scratch page and emits a violation report. The same shape today's BPF instruction faults have. 
Patches 1-2 (atomic PTE install + arena scratch-page recovery)
--------------------------------------------------------------
  mm: Add ptep_try_set() for lockless empty-slot installs
  bpf: Recover arena kernel faults with scratch page

Patches 3-5 (helpers used by struct_ops registration)
-----------------------------------------------------
  bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
  bpf: Add bpf_struct_ops_for_each_prog()
  bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
====================

Link: https://lore.kernel.org/bpf/20260522172219.1423324-1-tj@kernel.org/
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/bpf.h 14
-rw-r--r-- include/linux/bpf_defs.h 19
-rw-r--r-- include/linux/pgtable.h 25
3 files changed, 58 insertions, 0 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1df67a59cd97..1c6863ce89e0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
#include <crypto/sha2.h>
#include <linux/workqueue.h>
@@ -617,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
@@ -678,6 +681,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
u64 flags);
void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+ u64 flags);
#else
static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
int node_id, u64 flags)
@@ -688,6 +693,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
{
 /*
  * Intentionally empty: this is the stub used in the #else configuration
  * branch (the governing #if lies outside this hunk). All three arguments
  * are ignored and nothing is freed.
  */
}
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{ /*
+  * Stub for the #else configuration branch (the governing #if lies
+  * outside this hunk), mirroring the non-sleepable stub nearby. Every
+  * argument is ignored and no allocation is attempted; callers always
+  * get NULL back.
+  */
+ return NULL;
+}
#endif
extern const struct bpf_map_ops bpf_map_offload_ops;
@@ -2129,6 +2140,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+ int (*cb)(struct bpf_prog *prog, void *data),
+ void *data);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..2185cd3966d4
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+#ifdef CONFIG_BPF_SYSCALL
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+#else
+static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+ unsigned long fault_ip)
+{ /*
+  * !CONFIG_BPF_SYSCALL stub: @addr, @is_write and @fault_ip are ignored
+  * and the result is unconditionally false, so code calling this hook
+  * still builds when BPF syscall support is compiled out.
+  * NOTE(review): false presumably means "fault not recovered, continue
+  * with the normal kernel fault path" - confirm against the real
+  * implementation, which is not part of this header.
+  */
+ return false;
+}
+#endif
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..b5739bb99fc1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1036,6 +1036,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif
+#ifndef ptep_try_set
+/**
+ * ptep_try_set - atomically set an empty kernel PTE
+ * @ptep: pointer to the page table entry to fill
+ * @new_pte: value to install
+ *
+ * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Note that arch
+ * overrides may test for a strictly zero entry instead, which is narrower
+ * than pte_none(); see the inline comments in the x86 and arm64
+ * implementations.
+ *
+ * For special kernel page tables only - never user page tables. The caller must
+ * prevent concurrent teardown of @ptep and must accept that other writers may
+ * race. Concurrent clearers must use ptep_get_and_clear() so racing accesses
+ * agree on the outcome.
+ *
+ * Architectures opt in by providing a cmpxchg-based override and defining
+ * ptep_try_set as an identity macro (#define ptep_try_set ptep_try_set), which
+ * makes the #ifndef guard above skip this generic stub. The generic stub
+ * ignores both arguments and always returns false, which is correct for
+ * callers that fall through to oops on failure.
+ *
+ * Return: true if @new_pte was installed; false if the slot was already
+ * populated or the arch has no implementation.
+ */
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+ return false;
+}
+#endif
+
#ifndef wrprotect_ptes
/**
* wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same