diff options
Diffstat (limited to 'kernel')
33 files changed, 5835 insertions, 2964 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 399007b67a92..4dc41bf5780c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -53,12 +53,15 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; + struct page *scratch_page; struct range_tree rt; /* protects rt */ rqspinlock_t spinlock; struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) return arena ? arena->user_vm_start : 0; } +/** + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * + * @map: a BPF_MAP_TYPE_ARENA map + * + * Return @map's kern_vm_start. + */ +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) +{ + return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); +} + +/** + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog + * @prog: a loaded BPF program + * + * The verifier enforces at most one arena per program and stores it in + * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if + * @prog does not reference an arena. + */ +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) +{ + struct bpf_arena *arena = prog->aux->arena; + + return arena ? &arena->map : NULL; +} + static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; @@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; +struct clear_range_data { + struct llist_head *free_pages; + struct page *scratch_page; +}; + static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. + */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. + */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) { + struct clear_range_data *d = data; pte_t old_pte; struct page *page; - /* sanity check */ - old_pte = ptep_get(pte); + /* + * Pairs with ptep_try_set() in the kernel-fault scratch installer. + * Both sides must be atomic. + */ + old_pte = ptep_get_and_clear(&init_mm, addr, pte); if (pte_none(old_pte) || !pte_present(old_pte)) - return 0; /* nothing to do */ + return 0; page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; - pte_clear(&init_mm, addr, pte); + /* + * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr + * scratches its PTE. A later bpf_arena_free_pages() over that range walks + * here. Without the skip, scratch_page would be freed. + */ + if (page == d->scratch_page) + return 0; + + __llist_add(&page->pcp_llist, d->free_pages); + return 0; +} - /* Add page to the list so it is freed later */ - if (free_pages) - __llist_add(&page->pcp_llist, free_pages); +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct page *scratch_page = data; + if (!pte_none(ptep_get(pte))) + return 0; + /* + * Best-effort install. ptep_try_set() returns false only if another + * installer (real allocation or concurrent fault) won the cmpxchg. + * Their PTE is already valid, so the access retry succeeds. + * + * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just + * cause one extra re-fault through this same path. + */ + ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); return 0; } static int populate_pgtable_except_pte(struct bpf_arena *arena) { + /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); + SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) @@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) init_irq_work(&arena->free_irq, arena_free_irq); INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); + + err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); + if (err) + goto err_free_arena; + range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); - if (err) { - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); - if (err) { - range_tree_destroy(&arena->rt); - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_destroy_rt; return &arena->map; + +err_destroy_rt: + range_tree_destroy(&arena->rt); +err_free_scratch: + __free_page(arena->scratch_page); +err_free_arena: + bpf_map_area_free(arena); err: free_vm_area(kern_vm); return ERR_PTR(err); @@ -244,6 +338,7 @@ err: static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { + struct bpf_arena *arena = data; struct page *page; pte_t pte; @@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) return 0; page = pte_page(pte); /* + * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, + * so it can visit scratch PTEs at uaddrs the BPF program never allocated. + */ + if (page == arena->scratch_page) + return 0; + /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug * 2. TLB flushing is batched or deferred. Even if we clear pte, @@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map) * free those pages. */ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); free_vm_area(arena->kern_vm); range_tree_destroy(&arena->rt); + __free_page(arena->scratch_page); bpf_map_area_free(arena); } @@ -318,6 +420,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_RETRY; page = vmalloc_to_page((void *)kaddr); - if (page) + if (page) { + if (page == arena->scratch_page) + /* BPF triggered scratch here; don't lazy-alloc over it */ + goto out_sigsegv; /* already have a page vmap-ed */ goto out; + } bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); free_pages_nolock(page, 0); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } flush_vmap_cache(kaddr, PAGE_SIZE); bpf_map_memcg_exit(old_memcg, new_memcg); @@ -419,8 +528,9 @@ out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; -out_unlock_sigsegv: +out_sigsegv_memcg: bpf_map_memcg_exit(old_memcg, new_memcg); +out_sigsegv: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto out_free_pages; @@ -668,12 +779,60 @@ out_free_pages: */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. + * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. + */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) @@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, struct llist_head free_pages; struct llist_node *pos, *t; struct arena_free_span *s; + struct clear_range_data cdata; unsigned long flags; int ret = 0; @@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, range_tree_set(&arena->rt, pgoff, page_cnt); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; /* clear ptes and collect struct pages */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); /* drop the lock to do the tlb flush and zap pages */ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); @@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work) struct arena_free_span *s; u64 arena_vm_start, user_vm_start; struct llist_head free_pages; + struct clear_range_data cdata; struct page *page; unsigned long full_uaddr; long kaddr, page_cnt, pgoff; @@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work) bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work) /* clear ptes and collect pages in free_pages llist */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); range_tree_set(&arena->rt, pgoff, page_cnt); } @@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); } + +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); +} + __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -945,23 +1123,12 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, + unsigned long addr, unsigned long fault_ip) { struct bpf_stream_stage ss; - struct bpf_prog *prog; u64 user_vm_start; - /* - * The RCU read lock is held to safely traverse the latch tree, but we - * don't need its protection when accessing the prog, since it will not - * disappear while we are handling the fault. - */ - rcu_read_lock(); - prog = bpf_prog_ksym_find(fault_ip); - rcu_read_unlock(); - if (!prog) - return; - /* Use main prog for stream access */ prog = prog->aux->main_prog_aux->prog; @@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo bpf_stream_dump_stack(ss); })); } + +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) +{ + struct bpf_arena *arena; + struct bpf_prog *prog; + unsigned long kbase; + unsigned long page_addr = addr & PAGE_MASK; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return false; + + arena = prog->aux->arena; + /* a prog not using arena may be on stack, so arena can be NULL */ + if (!arena) + return false; + + kbase = bpf_arena_get_kern_vm_start(arena); + + /* + * Recovery covers the 4 GiB mappable band plus the upper half-guard. + * Lower guard is unreachable from kfuncs; an address there indicates + * a different bug class - leave it to the regular kernel oops path. + */ + if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) + return false; + + apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, + apply_range_set_scratch_cb, arena->scratch_page); + flush_vmap_cache(page_addr, PAGE_SIZE); + __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); + return true; +} + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); +} diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dfb2110ab733..248b4818178c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } -static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, - void *hash_buf) +static int array_map_get_hash(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, - hash_buf); - memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + array->map.sha); return 0; } @@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } return 0; } @@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(map, ptr, val); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); } unlock: rcu_read_unlock(); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 854731dc93fe..2e4ae0ef0860 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -9,7 +9,7 @@ /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) + int insn_flags, int spi, int frame, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + env->cur_hist_ent->spi = spi; + env->cur_hist_ent->frame = frame; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); @@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->spi = spi; + p->frame = frame; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn) (insn->imm & BPF_FETCH); } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. @@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt) int i; for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; + mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i]; return mask == 0; } +static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] &= ~(1 << slot); +} + +static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_arg_masks[frame] & (1 << slot); +} + static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { @@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt) return bt->stack_masks[bt->frame]; } +static inline u8 bt_stack_arg_mask(struct backtrack_state *bt) +{ + return bt->stack_arg_masks[bt->frame]; +} + static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); @@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; bt_clear_reg(bt, load_reg); + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + /* + * Stack arg read: callee reads from r11+off, but + * the data lives in the caller's stack_arg_regs. + * Set the mask in the caller frame so precision + * is marked in the caller's slot at the callee + * entry checkpoint. + */ + bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi); + return 0; + } + /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. * The desire to keep that precision is already indicated @@ -353,8 +375,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) @@ -363,11 +385,22 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * encountered a case of pointer subtraction. */ return -ENOTSUPP; + + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi)) + return 0; + bt_clear_frame_stack_arg_slot(bt, bt->frame, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + return 0; + } + /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); @@ -431,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } + if (bt_stack_arg_mask(bt)) { + verifier_bug(env, + "static subprog leftover stack arg slots %x", + bt_stack_arg_mask(bt)); + return -EFAULT; + } if (bt_subprog_exit(bt)) return -EFAULT; return 0; @@ -901,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env, *changed = true; } } + for (i = 0; i < func->out_stack_arg_cnt; i++) { + if (!bt_is_frame_stack_arg_slot_set(bt, fr, i)) + continue; + reg = &func->stack_arg_regs[i]; + if (reg->type != SCALAR_VALUE || reg->precise) { + bt_clear_frame_stack_arg_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..5ed7cb4b98c0 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,23 +13,8 @@ #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET -/* Helpers to get the local list index */ -#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) -#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) -#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -/* Local list helpers */ -static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_FREE_LIST_IDX]; -} - -static struct list_head * |
