diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-05-28 15:52:42 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-05-28 15:52:42 -0700 |
| commit | 90b83efa6701656e02c86e7df2cb1765ea602d07 (patch) | |
| tree | 59ac0306b5fe287af6691717ebcdbcc96163c3ca /kernel/bpf | |
| parent | 1b98f357dadd6ea613a435fbaef1a5dd7b35fd21 (diff) | |
| parent | c5cebb241e27ed0c3f4c1d2ce63089398e0ed17e (diff) | |
Merge tag 'bpf-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
- Fix and improve BTF deduplication of identical BTF types (Alan
Maguire and Andrii Nakryiko)
- Support up to 12 arguments in BPF trampoline on arm64 (Xu Kuohai and
Alexis Lothoré)
- Support load-acquire and store-release instructions in BPF JIT on
riscv64 (Andrea Parri)
- Fix uninitialized values in BPF_{CORE,PROBE}_READ macros (Anton
Protopopov)
- Streamline allowed helpers across program types (Feng Yang)
- Support atomic update for hashtab of BPF maps (Hou Tao)
- Implement JSON output for BPF helpers (Ihor Solodrai)
- Several s390 JIT fixes (Ilya Leoshkevich)
- Various sockmap fixes (Jiayuan Chen)
- Support mmap of vmlinux BTF data (Lorenz Bauer)
- Support BPF rbtree traversal and list peeking (Martin KaFai Lau)
- Tests for sockmap/sockhash redirection (Michal Luczaj)
- Introduce kfuncs for memory reads into dynptrs (Mykyta Yatsenko)
- Add support for dma-buf iterators in BPF (T.J. Mercier)
- Add verifier support for __bpf_trap() (Yonghong Song)
* tag 'bpf-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (135 commits)
bpf, arm64: Remove unused-but-set function and variable.
selftests/bpf: Add tests with stack ptr register in conditional jmp
bpf: Do not include stack ptr register in precision backtracking bookkeeping
selftests/bpf: enable many-args tests for arm64
bpf, arm64: Support up to 12 function arguments
bpf: Check rcu_read_lock_trace_held() in bpf_map_lookup_percpu_elem()
bpf: Avoid __bpf_prog_ret0_warn when jit fails
bpftool: Add support for custom BTF path in prog load/loadall
selftests/bpf: Add unit tests with __bpf_trap() kfunc
bpf: Warn with __bpf_trap() kfunc maybe due to uninitialized variable
bpf: Remove special_kfunc_set from verifier
selftests/bpf: Add test for open coded dmabuf_iter
selftests/bpf: Add test for dmabuf_iter
bpf: Add open coded dmabuf iterator
bpf: Add dmabuf iterator
dma-buf: Rename debugfs symbols
bpf: Fix error return value in bpf_copy_from_user_dynptr
libbpf: Use mmap to parse vmlinux BTF from sysfs
selftests: bpf: Add a test for mmapable vmlinux BTF
btf: Allow mmap of vmlinux btf
...
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
| -rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 45 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 32 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 29 | ||||
| -rw-r--r-- | kernel/bpf/dmabuf_iter.c | 150 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 148 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 133 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 10 | ||||
| -rw-r--r-- | kernel/bpf/sysfs_btf.c | 32 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 636 |
11 files changed, 753 insertions, 467 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 70502f038b92..3a335c50e6e3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o +ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) +obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index db13ee70d94d..96113633e391 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -601,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 324c47ab377a..1d2cf898e21e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -26,6 +26,7 @@ #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> @@ -3957,7 +3958,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. 
*/ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -5583,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (id < 0) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5610,7 +5611,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (ret != BTF_FIELD_FOUND) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5647,7 +5648,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) continue; parse: tab_cnt = tab ? tab->cnt : 0; - new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; @@ -6383,12 +6384,11 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_int_ptr(struct btf *btf, const struct btf_type *t) +static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_int(t); + return btf_type_is_void(t) || btf_type_is_int(t); } u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, @@ -6777,14 +6777,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } - if (t->type == 0) - /* This is a pointer to void. - * It is the same as scalar from the verifier safety pov. - * No further pointer walking is allowed. 
- */ - return true; - - if (is_int_ptr(btf, t)) + /* + * If it's a pointer to void, it's the same as scalar from the verifier + * safety POV. Either way, no further pointer walking is allowed. + */ + if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -6830,10 +6827,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } @@ -7663,7 +7660,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return 0; if (!prog->aux->func_info) { - bpf_log(log, "Verifier bug\n"); + verifier_bug(env, "func_info undefined"); return -EFAULT; } @@ -7687,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { - bpf_log(log, "Verifier bug in function %s()\n", tname); + verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) @@ -8564,7 +8561,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Grow set */ set = krealloc(tab->sets[hook], - offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), + struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -8850,7 +8847,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c } tab = krealloc(btf->dtor_kfunc_tab, - offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { 
ret = -ENOMEM; @@ -9408,8 +9405,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, tab = btf->struct_ops_tab; if (!tab) { - tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), - GFP_KERNEL); + tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; @@ -9422,8 +9418,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, - offsetof(struct btf_struct_ops_tab, - ops[tab->capacity * 2]), + struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cad0194552fb..9122c39870bf 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1687,10 +1687,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; @@ -2238,10 +2234,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; @@ -2385,10 +2377,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2635,23 +2623,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } - -/* Common helpers for cgroup hooks with valid process context. 
*/ -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; - default: - return NULL; - } -} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba6b6118cf50..c20babbf998f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2358,8 +2358,8 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, return 0; } -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; @@ -2368,14 +2368,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. - */ - if (bpf_prog_is_dev_bound(aux)) - return false; - spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2409,6 +2401,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map, return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. 
+ */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2421,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -2469,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = false; + bool jit_needed = fp->jit_requested; if (fp->bpf_func) goto finalize; diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c new file mode 100644 index 000000000000..4dd7ef7c145c --- /dev/null +++ b/kernel/bpf/dmabuf_iter.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Google LLC */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/dma-buf.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos) + return NULL; + + return dma_buf_iter_begin(); +} + +static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dma_buf *dmabuf = v; + + ++*pos; + + return dma_buf_iter_next(dmabuf); +} + +struct bpf_iter__dmabuf { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct dma_buf *, dmabuf); +}; + +static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter_meta meta = { + .seq = seq, + }; + struct bpf_iter__dmabuf ctx = { + .meta = &meta, + .dmabuf = v, + }; + struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop); + + if (prog) + return bpf_iter_run_prog(prog, &ctx); + + return 0; +} + +static int dmabuf_iter_seq_show(struct seq_file *seq, void *v) +{ + return __dmabuf_seq_show(seq, v, 
false); +} + +static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct dma_buf *dmabuf = v; + + if (dmabuf) + dma_buf_put(dmabuf); +} + +static const struct seq_operations dmabuf_iter_seq_ops = { + .start = dmabuf_iter_seq_start, + .next = dmabuf_iter_seq_next, + .stop = dmabuf_iter_seq_stop, + .show = dmabuf_iter_seq_show, +}; + +static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "dmabuf iter\n"); +} + +static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { + .seq_ops = &dmabuf_iter_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = 0, +}; + +static struct bpf_iter_reg bpf_dmabuf_reg_info = { + .target = "dmabuf", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_dmabuf_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__dmabuf, dmabuf), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &dmabuf_iter_seq_info, +}; + +DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf) +BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf) + +static int __init dmabuf_iter_init(void) +{ + bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0]; + return bpf_iter_reg_target(&bpf_dmabuf_reg_info); +} + +late_initcall(dmabuf_iter_init); + +struct bpf_iter_dmabuf { + /* + * opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __aligned(8); + +/* Non-opaque version of bpf_iter_dmabuf */ +struct bpf_iter_dmabuf_kern { + struct dma_buf *dmabuf; +} __aligned(8); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->dmabuf = NULL; + return 0; +} + +__bpf_kfunc struct dma_buf 
*bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + kit->dmabuf = dma_buf_iter_next(kit->dmabuf); + else + kit->dmabuf = dma_buf_iter_begin(); + + return kit->dmabuf; +} + +__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + dma_buf_put(kit->dmabuf); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 92b606d60020..71f9931ac64c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + roundup(key_size, 8)); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -196,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. 
LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -215,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -245,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -453,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -549,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. 
- */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -670,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -709,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -763,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -968,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1039,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1072,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = 
container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1106,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1134,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1156,24 +1162,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. 
*/ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } } htab_unlock_bucket(b, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); - } + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: htab_unlock_bucket(b, flags); @@ -1220,8 +1216,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); ret = htab_lock_bucket(b, &flags); if (ret) @@ -1255,13 +1250,14 @@ err_lock_bucket: return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1292,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + /* Update value in-place */ + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = 
alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -1383,7 +1387,7 @@ err_lock_bucket: static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -1500,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1615,15 +1619,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1680,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - 
u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1720,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1812,8 +1812,8 @@ again_nocopy: off += size; } } else { - value = l->key + roundup_key_size; - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + value = htab_elem_value(l, key_size); + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ @@ -2063,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2077,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); 
@@ -2165,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2180,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2196,7 +2193,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, @@ -2411,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; @@ -2536,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). 
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); |
