aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 09:18:14 +0100
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 09:18:14 +0100
commit 9c87e61e3c5797277407ba5eae4eac8a52be3fa3 (patch)
tree e3f902cb5363b5b90ab74a4b7e26fafbc15aaeaf /kernel
parent b85966adbf5de0668a815c6e3527f87e0c387fb4 (diff)
parent e4287bf34f97a88c7d9322f5bde828724c073a6b (diff)
Merge tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:

"Major changes:

 - Recover from BPF arena page faults using a scratch page and add
   ptep_try_set() for lockless empty-slot installs on x86 and arm64.
   This allows BPF kfuncs to access arena pointers directly. The
   'arena_direct_access' stable branch was created for this work and
   was pulled into sched-ext and bpf-next trees (Tejun Heo, Kumar
   Kartikeya Dwivedi)

 - Lift old restriction and support 6+ arguments in BPF programs and
   kfuncs on x86 and arm64 (Yonghong Song, Puranjay Mohan)

Other features and fixes:

 - Add 24-bit BTF vlen and reclaim unused bits in the BTF UAPI to ease
   addition of new BTF kinds (Alan Maguire)

 - Raise the maximum BPF call chain depth from 8 to 16 frames (Alexei
   Starovoitov)

 - Refactor object relationship tracking in the verifier and fix a
   dynptr use-after-free bug (Amery Hung)

 - Harden the signed program loader and reject exclusive maps as inner
   maps (Daniel Borkmann)

 - Replace the verifier min/max bounds fields with a circular number
   (cnum) representation and improve 32->64 bit range refinements
   (Eduard Zingerman)

 - Introduce the arena library and runtime (libarena) with a buddy
   allocator, rbtree and SPMC queue data structures, ASAN support and a
   parallel test harness. Allow subprograms to return arena pointers
   and switch to a BTF type-tag based __arena annotation (Emil
   Tsalapatis)

 - Cache build IDs in the sleepable stackmap path and avoid faultable
   build ID reads under mm locks (Ihor Solodrai)

 - Introduce the tracing_multi link to attach a single BPF program to
   many kernel functions at once. Allow specifying the uprobe_multi
   target via FD (Jiri Olsa)

 - Extend the bpf_list family of kfuncs with bpf_list_add/del(), and
   bpf_list_is_first/is_last/empty() (Kaitao Cheng)

 - Extend the BPF syscall with common attributes support for prog_load,
   btf_load and map_create (Leon Hwang)

 - Wrap rhashtable as BPF map (Mykyta Yatsenko, Herbert Xu)

 - Add sleepable support for tracepoint programs and fix deadlocks in
   LRU map due to NMI reentry (Mykyta Yatsenko)

 - Fix OOB access in bpf_flow_keys, fix nullness analysis of inner
   arrays, enforce write checks for global subprograms (Nuoqi Gui)

 - Report the maximum combined stack depth and print a breakdown of
   instructions processed per subprogram (Paul Chaignon)

 - Add an XDP load-balancer benchmark and arm64 JIT support for stack
   arguments (Puranjay Mohan)

 - Add kfuncs to traverse over wakeup_sources (Samuel Wu)

 - Allow sleepable BPF programs to use LPM trie maps directly (Vlad
   Poenaru)

 - Many more fixes and cleanups across the verifier, BTF, sockmap,
   devmap, bpffs, security hooks, s390/riscv/loongarch JITs,
   rqspinlock, libbpf, bpftool, selftests"

* tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (336 commits)
  selftests/bpf: Work around llvm stack overflow in crypto progs
  selftests/bpf: add test for bpf_msg_pop_data() overflow
  bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check
  sockmap: Fix use-after-free in udp_bpf_recvmsg()
  bpf, sockmap: keep sk_msg copy state in sync
  bpf, sockmap: Fix wrong rsge offset in bpf_msg_push_data()
  bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data()
  selftsets/bpf: Retry map update on helper_fill_hashmap()
  selftests/bpf: Add test for sleepable lsm_cgroup rejection
  selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper
  bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
  selftests/bpf: Avoid static LLVM linking for cross builds
  selftests/bpf: Use common CFLAGS for urandom_read
  selftests/bpf: Initialize operation name before use
  tools/bpf: build: Append extra cflags
  libbpf: Initialize CFLAGS before including Makefile.include
  bpftool: Append extra host flags
  bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS
  bpftool: Pass host flags to bootstrap libbpf
  selftests/bpf: correct CONFIG_PPC64 macro name in comment
  ...
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/Makefile 2
-rw-r--r-- kernel/bpf/arena.c 315
-rw-r--r-- kernel/bpf/arraymap.c 14
-rw-r--r-- kernel/bpf/backtrack.c 82
-rw-r--r-- kernel/bpf/bpf_lru_list.c 165
-rw-r--r-- kernel/bpf/bpf_lru_list.h 25
-rw-r--r-- kernel/bpf/bpf_lsm.c 20
-rw-r--r-- kernel/bpf/bpf_struct_ops.c 63
-rw-r--r-- kernel/bpf/btf.c 318
-rw-r--r-- kernel/bpf/cgroup.c 65
-rw-r--r-- kernel/bpf/cnum.c 120
-rw-r--r-- kernel/bpf/cnum_defs.h 247
-rw-r--r-- kernel/bpf/const_fold.c 8
-rw-r--r-- kernel/bpf/core.c 27
-rw-r--r-- kernel/bpf/devmap.c 19
-rw-r--r-- kernel/bpf/fixups.c 27
-rw-r--r-- kernel/bpf/hashtab.c 840
-rw-r--r-- kernel/bpf/helpers.c 204
-rw-r--r-- kernel/bpf/inode.c 4
-rw-r--r-- kernel/bpf/liveness.c 183
-rw-r--r-- kernel/bpf/log.c 132
-rw-r--r-- kernel/bpf/lpm_trie.c 8
-rw-r--r-- kernel/bpf/map_in_map.c 5
-rw-r--r-- kernel/bpf/map_iter.c 7
-rw-r--r-- kernel/bpf/stackmap.c 215
-rw-r--r-- kernel/bpf/states.c 67
-rw-r--r-- kernel/bpf/syscall.c 312
-rw-r--r-- kernel/bpf/trampoline.c 671
-rw-r--r-- kernel/bpf/verifier.c 4146
-rw-r--r-- kernel/events/core.c 9
-rw-r--r-- kernel/trace/bpf_trace.c 334
-rw-r--r-- kernel/trace/ftrace.c 35
-rw-r--r-- kernel/trace/trace_syscalls.c 110
33 files changed, 5835 insertions, 2964 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 399007b67a92..4dc41bf5780c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 49a8f7b1beef..af49c154473d 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,12 +53,15 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
+ struct page *scratch_page;
struct range_tree rt;
/* protects rt */
rqspinlock_t spinlock;
struct list_head vma_list;
/* protects vma_list */
struct mutex lock;
+ u64 zap_gen;
+ struct mutex zap_mutex;
struct irq_work free_irq;
struct work_struct free_work;
struct llist_head free_spans;
@@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
return arena ? arena->user_vm_start : 0;
}
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+ return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+ struct bpf_arena *arena = prog->aux->arena;
+
+ return arena ? &arena->map : NULL;
+}
+
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
@@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
struct apply_range_data {
struct page **pages;
+ struct page *scratch_page;
int i;
};
+struct clear_range_data {
+ struct llist_head *free_pages;
+ struct page *scratch_page;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
struct page *page;
+ pte_t pteval;
if (!data)
return 0;
- /* sanity check */
- if (unlikely(!pte_none(ptep_get(pte))))
- return -EBUSY;
page = d->pages[d->i];
/* paranoia, similar to vmap_pages_pte_range() */
if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
return -EINVAL;
- set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ pteval = mk_pte(page, PAGE_KERNEL);
+#ifdef ptep_try_set
+ /*
+ * Kernel-fault recovery may have installed the scratch page here, and
+ * some architectures (arm64) prohibit valid->valid PTE transitions.
+ * Install atomically into a none slot. If scratch is present, clear it
+ * and flush_tlb_before_set() (break-before-make) before retrying.
+ */
+ while (!ptep_try_set(pte, pteval)) {
+ pte_t old = ptep_get(pte);
+
+ if (pte_none(old))
+ continue;
+ if (WARN_ON_ONCE(pte_page(old) != d->scratch_page))
+ return -EBUSY;
+ ptep_get_and_clear(&init_mm, addr, pte);
+ flush_tlb_before_set(addr);
+ }
+#else
+ /*
+ * Without ptep_try_set() there is no atomic installer, but such arches
+ * also do not wire up bpf_arena_handle_page_fault(), so no scratch page
+ * is ever installed and the slot is always none here.
+ */
+ if (unlikely(!pte_none(ptep_get(pte))))
+ return -EBUSY;
+ set_pte_at(&init_mm, addr, pte, pteval);
+#endif
d->i++;
return 0;
}
@@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct clear_range_data *d = data;
pte_t old_pte;
struct page *page;
- /* sanity check */
- old_pte = ptep_get(pte);
+ /*
+ * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+ * Both sides must be atomic.
+ */
+ old_pte = ptep_get_and_clear(&init_mm, addr, pte);
if (pte_none(old_pte) || !pte_present(old_pte))
- return 0; /* nothing to do */
+ return 0;
page = pte_page(old_pte);
if (WARN_ON_ONCE(!page))
return -EINVAL;
- pte_clear(&init_mm, addr, pte);
+ /*
+ * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+ * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+ * here. Without the skip, scratch_page would be freed.
+ */
+ if (page == d->scratch_page)
+ return 0;
+
+ __llist_add(&page->pcp_llist, d->free_pages);
+ return 0;
+}
- /* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *scratch_page = data;
+ if (!pte_none(ptep_get(pte)))
+ return 0;
+ /*
+ * Best-effort install. ptep_try_set() returns false only if another
+ * installer (real allocation or concurrent fault) won the cmpxchg.
+ * Their PTE is already valid, so the access retry succeeds.
+ *
+ * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+ * cause one extra re-fault through this same path.
+ */
+ ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
init_irq_work(&arena->free_irq, arena_free_irq);
INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
+
+ err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+ if (err)
+ goto err_free_arena;
+
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
- if (err) {
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_free_scratch;
mutex_init(&arena->lock);
+ mutex_init(&arena->zap_mutex);
raw_res_spin_lock_init(&arena->spinlock);
err = populate_pgtable_except_pte(arena);
- if (err) {
- range_tree_destroy(&arena->rt);
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_destroy_rt;
return &arena->map;
+
+err_destroy_rt:
+ range_tree_destroy(&arena->rt);
+err_free_scratch:
+ __free_page(arena->scratch_page);
+err_free_arena:
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
@@ -244,6 +338,7 @@ err:
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
@@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
return 0;
page = pte_page(pte);
/*
+ * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+ * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+ */
+ if (page == arena->scratch_page)
+ return 0;
+ /*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
* 2. TLB flushing is batched or deferred. Even if we clear pte,
@@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map)
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
+ __free_page(arena->scratch_page);
bpf_map_area_free(arena);
}
@@ -318,6 +420,7 @@ struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
refcount_t mmap_count;
+ u64 zap_gen;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
refcount_set(&vml->mmap_count, 1);
vma->vm_private_data = vml;
vml->vma = vma;
+ vml->zap_gen = 0;
list_add(&vml->head, &arena->vma_list);
return 0;
}
@@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
- if (page)
+ if (page) {
+ if (page == arena->scratch_page)
+ /* BPF triggered scratch here; don't lazy-alloc over it */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
+ }
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
- struct apply_range_data data = { .pages = &page, .i = 0 };
+ struct apply_range_data data = { .pages = &page, .i = 0,
+ .scratch_page = arena->scratch_page };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
free_pages_nolock(page, 0);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
flush_vmap_cache(kaddr, PAGE_SIZE);
bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -419,8 +528,9 @@ out:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
return VM_FAULT_SIGSEGV;
}
@@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
data.pages = pages;
+ data.scratch_page = arena->scratch_page;
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
goto out_free_pages;
@@ -668,12 +779,60 @@ out_free_pages:
*/
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
+ unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
struct vma_list *vml;
+ unsigned long vm_start;
+ u64 my_gen;
- guard(mutex)(&arena->lock);
- /* iterate link list under lock */
- list_for_each_entry(vml, &arena->vma_list, head)
- zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt);
+ /*
+ * Taking mmap_read_lock() under arena->lock would deadlock against
+ * arena_vm_close(), which runs with mmap_write_lock held and then
+ * acquires arena->lock. Drop arena->lock for mmap_read_lock().
+ *
+ * Use per-call my_gen, recorded in vml->zap_gen, to remember which
+ * vmls this invocation has already processed across the lock drop.
+ * Hold zap_mutex around the whole walk so concurrent zap_pages()
+ * callers cannot overwrite each other's marks on shared vmls --
+ * otherwise call B's mark would make call A skip a vml that A has
+ * not yet zapped for A's uaddr range.
+ */
+ mutex_lock(&arena->zap_mutex);
+ mutex_lock(&arena->lock);
+ my_gen = ++arena->zap_gen;
+ for (;;) {
+ mm = NULL;
+ list_for_each_entry(vml, &arena->vma_list, head) {
+ if (vml->zap_gen >= my_gen)
+ continue;
+ vml->zap_gen = my_gen;
+ if (!mmget_not_zero(vml->vma->vm_mm))
+ continue;
+ mm = vml->vma->vm_mm;
+ vm_start = vml->vma->vm_start;
+ break;
+ }
+ if (!mm)
+ break;
+ mutex_unlock(&arena->lock);
+
+ mmap_read_lock(mm);
+ /*
+ * Re-resolve: while we waited the VMA could have been unmapped
+ * and a different mapping installed at the same address.
+ */
+ vma = find_vma(mm, vm_start);
+ if (vma && vma->vm_start == vm_start &&
+ vma->vm_file && vma->vm_file->private_data == &arena->map)
+ zap_vma_range(vma, uaddr, size);
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ mutex_lock(&arena->lock);
+ }
+ mutex_unlock(&arena->lock);
+ mutex_unlock(&arena->zap_mutex);
}
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
@@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
+ struct clear_range_data cdata;
unsigned long flags;
int ret = 0;
@@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work)
struct arena_free_span *s;
u64 arena_vm_start, user_vm_start;
struct llist_head free_pages;
+ struct clear_range_data cdata;
struct page *page;
unsigned long full_uaddr;
long kaddr, page_cnt, pgoff;
@@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work)
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
@@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work)
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
@@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}
+
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
@@ -945,23 +1123,12 @@ static int __init kfunc_init(void)
}
late_initcall(kfunc_init);
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+ unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
- struct bpf_prog *prog;
u64 user_vm_start;
- /*
- * The RCU read lock is held to safely traverse the latch tree, but we
- * don't need its protection when accessing the prog, since it will not
- * disappear while we are handling the fault.
- */
- rcu_read_lock();
- prog = bpf_prog_ksym_find(fault_ip);
- rcu_read_unlock();
- if (!prog)
- return;
-
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
@@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
bpf_stream_dump_stack(ss);
}));
}
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+ struct bpf_arena *arena;
+ struct bpf_prog *prog;
+ unsigned long kbase;
+ unsigned long page_addr = addr & PAGE_MASK;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return false;
+
+ arena = prog->aux->arena;
+ /* a prog not using arena may be on stack, so arena can be NULL */
+ if (!arena)
+ return false;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+
+ /*
+ * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+ * Lower guard is unreachable from kfuncs; an address there indicates
+ * a different bug class - leave it to the regular kernel oops path.
+ */
+ if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+ return false;
+
+ apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+ apply_range_set_scratch_cb, arena->scratch_page);
+ flush_vmap_cache(page_addr, PAGE_SIZE);
+ __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+ return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+ __bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dfb2110ab733..248b4818178c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + (u64)array->elem_size * (index & array->index_mask);
}
-static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
- void *hash_buf)
+static int array_map_get_hash(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
sha256(array->value, (u64)array->elem_size * array->map.max_entries,
- hash_buf);
- memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ array->map.sha);
return 0;
}
@@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
copy_map_value(map, val, value);
- bpf_obj_free_fields(array->map.record, val);
+ bpf_obj_cancel_fields(map, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- bpf_obj_free_fields(array->map.record, val);
+ bpf_obj_cancel_fields(map, val);
}
return 0;
}
@@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
cpu = map_flags >> 32;
ptr = per_cpu_ptr(pptr, cpu);
copy_map_value(map, ptr, value);
- bpf_obj_free_fields(array->map.record, ptr);
+ bpf_obj_cancel_fields(map, ptr);
goto unlock;
}
for_each_possible_cpu(cpu) {
ptr = per_cpu_ptr(pptr, cpu);
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
copy_map_value(map, ptr, val);
- bpf_obj_free_fields(array->map.record, ptr);
+ bpf_obj_cancel_fields(map, ptr);
}
unlock:
rcu_read_unlock();
diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c
index 854731dc93fe..2e4ae0ef0860 100644
--- a/kernel/bpf/backtrack.c
+++ b/kernel/bpf/backtrack.c
@@ -9,7 +9,7 @@
/* for any branch, call, exit record the history of jmps in the given state */
int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
+ int insn_flags, int spi, int frame, u64 linked_regs)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_jmp_history_entry *p;
@@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
env, "insn history: insn_idx %d cur flags %x new flags %x",
env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
+ env->cur_hist_ent->spi = spi;
+ env->cur_hist_ent->frame = frame;
verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
"insn history: insn_idx %d linked_regs: %#llx",
env->insn_idx, env->cur_hist_ent->linked_regs);
@@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
+ p->spi = spi;
+ p->frame = frame;
p->linked_regs = linked_regs;
cur->jmp_history_cnt = cnt;
env->cur_hist_ent = p;
@@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
(insn->imm & BPF_FETCH);
}
-static int insn_stack_access_spi(int insn_flags)
-{
- return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
-}
-
-static int insn_stack_access_frameno(int insn_flags)
-{
- return insn_flags & INSN_F_FRAMENO_MASK;
-}
-
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
* Return -ENOENT if we exhausted all instructions within given state.
@@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt)
int i;
for (i = 0; i <= bt->frame; i++)
- mask |= bt->reg_masks[i] | bt->stack_masks[i];
+ mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i];
return mask == 0;
}
+static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_arg_masks[frame] &= ~(1 << slot);
+}
+
+static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ return bt->stack_arg_masks[frame] & (1 << slot);
+}
+
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
if (bt->frame == MAX_CALL_FRAMES - 1) {
@@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt)
return bt->stack_masks[bt->frame];
}
+static inline u8 bt_stack_arg_mask(struct backtrack_state *bt)
+{
+ return bt->stack_arg_masks[bt->frame];
+}
+
static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
{
return bt->reg_masks[bt->frame] & (1 << reg);
@@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
return 0;
bt_clear_reg(bt, load_reg);
+ if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) {
+ spi = hist->spi;
+ /*
+ * Stack arg read: callee reads from r11+off, but
+ * the data lives in the caller's stack_arg_regs.
+ * Set the mask in the caller