diff options
Diffstat (limited to 'kernel/bpf/syscall.c')
| -rw-r--r-- | kernel/bpf/syscall.c | 250 |
1 files changed, 168 insertions, 82 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ff82144f885..a3c0214ca934 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -9,6 +9,7 @@ #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h> +#include <linux/hex.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> @@ -133,12 +134,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) { - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -314,11 +317,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); + err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); + err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); + err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { @@ -505,17 +508,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) return root_mem_cgroup; } +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = bpf_map_get_memcg(map); + *old_memcg = set_active_memcg(*new_memcg); +} + +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *new_memcg) +{ + set_active_memcg(old_memcg); + mem_cgroup_put(new_memcg); +} + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -526,11 +541,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -540,11 +553,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -555,11 +566,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -570,11 +579,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -612,12 +619,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg, *old_memcg; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); -#endif for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -631,10 +633,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } -#ifdef CONFIG_MEMCG - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); -#endif return ret; } @@ -943,14 +941,6 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu) bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); } -static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) -{ - if (rcu_trace_implies_rcu_gp()) - bpf_map_free_rcu_gp(rcu); - else - call_rcu(rcu, bpf_map_free_rcu_gp); -} - /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -961,8 +951,9 @@ void bpf_map_put(struct bpf_map *map) bpf_map_free_id(map); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); + /* RCU tasks trace grace period implies RCU grace period. */ if (READ_ONCE(map->free_after_mult_rcu_gp)) - call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); else if (READ_ONCE(map->free_after_rcu_gp)) call_rcu(&map->rcu, bpf_map_free_rcu_gp); else @@ -1236,7 +1227,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) @@ -1366,11 +1357,6 @@ free_map_tab: return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) @@ -1734,7 +1720,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1742,7 +1728,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1809,7 +1795,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2005,11 +1991,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2064,11 +2051,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2190,7 +2177,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -2820,6 +2807,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr void *sig; int err = 0; + /* + * Don't attempt to use kmalloc_large or vmalloc for signatures. + * Practical signature for BPF program should be below this limit. + */ + if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) + return -EINVAL; + if (system_keyring_id_check(attr->keyring_id) == 0) key = bpf_lookup_system_key(attr->keyring_id); else @@ -2831,7 +2825,7 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr sig = kvmemdup_bpfptr(usig, attr->signature_size); if (IS_ERR(sig)) { bpf_key_put(key); - return -ENOMEM; + return PTR_ERR(sig); } bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, @@ -3089,10 +3083,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; - prog = bpf_prog_select_runtime(prog, &err); - if (err < 0) - goto free_used_maps; - err = bpf_prog_mark_insn_arrays_ready(prog); if (err < 0) goto free_used_maps; @@ -3260,12 +3250,16 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) bpf_link_dealloc(link); } -static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +static bool bpf_link_is_tracepoint(struct bpf_link *link) { - if (rcu_trace_implies_rcu_gp()) - bpf_link_defer_dealloc_rcu_gp(rcu); - else - call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); + /* + * Only these combinations support a tracepoint bpf_link. + * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use + * bpf_raw_tp_link_lops and thus dealloc_deferred(), see + * bpf_raw_tp_link_attach(). + */ + return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT || + (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP); } /* bpf_link_free is guaranteed to be called from process context */ @@ -3278,16 +3272,26 @@ static void bpf_link_free(struct bpf_link *link) if (link->prog) ops->release(link); if (ops->dealloc_deferred) { - /* Schedule BPF link deallocation, which will only then + /* + * Schedule BPF link deallocation, which will only then * trigger putting BPF program refcount. * If underlying BPF program is sleepable or BPF link's target * attach hookpoint is sleepable or otherwise requires RCU GPs * to ensure link and its underlying BPF program is not * reachable anymore, we need to first wait for RCU tasks - * trace sync, and then go through "classic" RCU grace period + * trace sync, and then go through "classic" RCU grace period. + * + * For tracepoint BPF links, we need to go through SRCU grace + * period wait instead when non-faultable tracepoint is used. We + * don't need to chain SRCU grace period waits, however, for the + * faultable case, since it exclusively uses RCU Tasks Trace. */ if (link->sleepable || (link->prog && link->prog->sleepable)) - call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + /* RCU Tasks Trace grace period implies RCU grace period. */ + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */ + else if (bpf_link_is_tracepoint(link)) + call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); } else if (ops->dealloc) { @@ -3579,6 +3583,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3628,7 +3633,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc_obj(*fslink, GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc_obj(*link, GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -3717,6 +3736,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tr = prog->aux->dst_trampoline; tgt_prog = prog->aux->dst_prog; } + /* + * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true + * freplace prog. Without this check, kprobe_write_ctx=true freplace + * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and + * then modify the registers of the kprobe prog's target kernel + * function. + * + * This also blocks the combination of uprobe+freplace, because it is + * unable to recognize the use of the tgt_prog as an uprobe or a kprobe + * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by + * the target perf event flags in __perf_event_set_bpf_prog(). + */ + if (prog->type == BPF_PROG_TYPE_EXT && + prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { + err = -EINVAL; + goto out_unlock; + } err = bpf_link_prime(&link->link.link, &link_primer); if (err) @@ -4167,7 +4203,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro if (IS_ERR(perf_file)) return PTR_ERR(perf_file); - link = kzalloc(sizeof(*link), GFP_USER); + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_file; @@ -4245,7 +4281,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; - link = kzalloc(sizeof(*link), GFP_USER); + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_btp; @@ -4352,6 +4388,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: @@ -4565,6 +4602,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); + } else if (!bpf_mprog_detach_empty(ptype)) { + return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) @@ -5310,6 +5349,9 @@ static int bpf_map_get_info_by_fd(struct file *file, if (info.hash_size != SHA256_DIGEST_SIZE) return -EINVAL; + if (!READ_ONCE(map->frozen)) + return -EPERM; + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); if (err != 0) return err; @@ -6054,9 +6096,8 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } - used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, - sizeof(used_maps_new[0]), - GFP_KERNEL); + used_maps_new = kmalloc_objs(used_maps_new[0], + prog->aux->used_map_cnt + 1); if (!used_maps_new) { ret = -ENOMEM; goto out_unlock; @@ -6122,6 +6163,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6345,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; @@ -6281,8 +6368,7 @@ static bool syscall_prog_is_valid_access(int off, int size, { if (off < 0 || off >= U16_MAX) return false; - if (off % size != 0) - return false; + /* No alignment requirements for syscall ctx accesses. */ return true; } @@ -6407,7 +6493,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, |
