diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/bpf_iter.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lru_list.c | 7 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 76 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 120 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 18 | ||||
| -rw-r--r-- | kernel/bpf/cpumap.c | 46 | ||||
| -rw-r--r-- | kernel/bpf/devmap.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/disasm.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 12 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 16 | ||||
| -rw-r--r-- | kernel/bpf/task_iter.c | 267 | ||||
| -rw-r--r-- | kernel/bpf/trampoline.c | 77 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 877 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 6 |
15 files changed, 1193 insertions, 341 deletions
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5454161407f1..a0d9eade9c80 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 1b6b9349cb85..d99e89f113c4 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { + u8 node_type = READ_ONCE(node->type); unsigned long flags; - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8962f988514f..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen == 0"); - return -EINVAL; - } - if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; @@ -5296,15 +5291,16 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5338,38 +5334,57 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - if (reg[i + 1].type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", i + 1); goto out; } if (btf_type_is_ptr(t)) { - if (reg[i + 1].type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg[i + 1].type != PTR_TO_CTX) { + if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expected pointer to ctx, but got %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ctx_reg(env, reg, i + 1)) goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5393,14 +5408,14 @@ out: * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5464,16 +5479,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - reg[i + 1].type = SCALAR_VALUE; + reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg[i + 1].type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6aa9e10c6335..b567ca46555c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 0 : -EPERM; } @@ -1298,7 +1302,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1315,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1333,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1340,8 +1364,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1350,7 +1373,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1413,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1447,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1419,13 +1460,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1488,9 +1528,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. + */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5bbd4884ff7a..0ae015ad1e05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -114,8 +120,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +131,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -238,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, * reallocated structure. */ fp_old->aux = NULL; + fp_old->stats = NULL; + fp_old->active = NULL; __bpf_prog_free(fp_old); } @@ -249,10 +258,11 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. - */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..85d9d1b72a33 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 19ff8fed7f4b..3acc7e0b6916 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -161,7 +161,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - (insn->imm == BPF_ADD || insn->imm == BPF_ADD || + (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c1ac7f964bc9..d63912e73ad9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41ca280b1dc1..308427fe03a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - if (!perfmon_capable()) - return NULL; - return bpf_get_trace_printk_proto(); - case BPF_FUNC_snprintf_btf: - if (!perfmon_capable()) - return NULL; - return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: @@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_probe_read_user: @@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 175b7b42bfc4..b68cb5d6d6eb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +struct bpf_iter_seq_task_vma_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct vm_area_struct *vma; + u32 tid; + unsigned long prev_vm_start; + unsigned long prev_vm_end; +}; + +enum bpf_task_vma_iter_find_op { + task_vma_iter_first_vma, /* use mm->mmap */ + task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_find_vma, /* use find_vma() to find next vma */ +}; + +static struct vm_area_struct * +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) +{ + struct pid_namespace *ns = info->common.ns; + enum bpf_task_vma_iter_find_op op; + struct vm_area_struct *curr_vma; + struct task_struct *curr_task; + u32 curr_tid = info->tid; + + /* If this function returns a non-NULL vma, it holds a reference to + * the task_struct, and holds read lock on vma->mm->mmap_lock. + * If this function returns NULL, it does not hold any reference or + * lock. + */ + if (info->task) { + curr_task = info->task; + curr_vma = info->vma; + /* In case of lock contention, drop mmap_lock to unblock + * the writer. + * + * After relock, call find(mm, prev_vm_end - 1) to find + * new vma to process. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * For example, curr_vma == VMA2. Before unlock, we set + * + * prev_vm_start = 8k + * prev_vm_end = 16k + * + * There are a few cases: + * + * 1) VMA2 is freed, but VMA3 exists. + * + * find_vma() will return VMA3, just process VMA3. + * + * 2) VMA2 still exists. + * + * find_vma() will return VMA2, process VMA2->next. + * + * 3) no more vma in this mm. + * + * Process the next task. + * + * 4) find_vma() returns a different vma, VMA2'. + * + * 4.1) If VMA2 covers same range as VMA2', skip VMA2', + * because we already covered the range; + * 4.2) VMA2 and VMA2' covers different ranges, process + * VMA2'. + */ + if (mmap_lock_is_contended(curr_task->mm)) { + info->prev_vm_start = curr_vma->vm_start; + info->prev_vm_end = curr_vma->vm_end; + op = task_vma_iter_find_vma; + mmap_read_unlock(curr_task->mm); + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } else { + op = task_vma_iter_next_vma; + } + } else { +again: + curr_task = task_seq_get_next(ns, &curr_tid, true); + if (!curr_task) { + info->tid = curr_tid + 1; + goto finish; + } + + if (curr_tid != info->tid) { + info->tid = curr_tid; + /* new task, process the first vma */ + op = task_vma_iter_first_vma; + } else { + /* Found the same tid, which means the user space + * finished data in previous buffer and read more. + * We dropped mmap_lock before returning to user + * space, so it is necessary to use find_vma() to + * find the next vma to process. + */ + op = task_vma_iter_find_vma; + } + + if (!curr_task->mm) + goto next_task; + + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } + + switch (op) { + case task_vma_iter_first_vma: + curr_vma = curr_task->mm->mmap; + break; + case task_vma_iter_next_vma: + curr_vma = curr_vma->vm_next; + break; + case task_vma_iter_find_vma: + /* We dropped mmap_lock so it is necessary to use find_vma + * to find the next vma. This is similar to the mechanism + * in show_smaps_rollup(). + */ + curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + /* case 1) and 4.2) above just use curr_vma */ + + /* check for case 2) or case 4.1) above */ + if (curr_vma && + curr_vma->vm_start == info->prev_vm_start && + curr_vma->vm_end == info->prev_vm_end) + curr_vma = curr_vma->vm_next; + break; + } + if (!curr_vma) { + /* case 3) above, or case 2) 4.1) with vma->next == NULL */ + mmap_read_unlock(curr_task->mm); + goto next_task; + } + info->task = curr_task; + info->vma = curr_vma; + return curr_vma; + +next_task: + put_task_struct(curr_task); + info->task = NULL; + curr_tid++; + goto again; + +finish: + if (curr_task) + put_task_struct(curr_task); + info->task = NULL; + info->vma = NULL; + return NULL; +} + +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct vm_area_struct *vma; + + vma = task_vma_seq_get_next(info); + if (vma && *pos == 0) + ++*pos; + + return vma; +} + +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + ++*pos; + return task_vma_seq_get_next(info); +} + +struct bpf_iter__task_vma { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + __bpf_md_ptr(struct vm_area_struct *, vma); +}; + +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, + struct task_struct *task, struct vm_area_struct *vma) + +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct bpf_iter__task_vma ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.vma = info->vma; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_vma_seq_show(struct seq_file *seq, void *v) +{ + return __task_vma_seq_show(seq, false); +} + +static void task_vma_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + if (!v) { + (void)__task_vma_seq_show(seq, true); + } else { + /* info->vma has not been seen by the BPF program. If the + * user space reads more, task_vma_seq_get_next should + * return this vma again. Set prev_vm_start to ~0UL, + * so that we don't skip the vma returned by the next + * find_vma() (case task_vma_iter_find_vma in + * task_vma_seq_get_next()). + */ + info->prev_vm_start = ~0UL; + info->prev_vm_end = info->vma->vm_end; + mmap_read_unlock(info->task->mm); + put_task_struct(info->task); + info->task = NULL; + } +} + +static const struct seq_operations task_vma_seq_ops = { + .start = task_vma_seq_start, + .next = task_vma_seq_next, + .stop = task_vma_seq_stop, + .show = task_vma_seq_show, +}; + BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, @@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = { .seq_info = &task_file_seq_info, }; +static const struct bpf_iter_seq_info task_vma_seq_info = { + .seq_ops = |
